Example #1
def analyze_days(headers):
    """ Creates a bar chart showing the number of emails sent per day"""
    util.log_print("Running Day of Week Analysis")
    days_of_week = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    email_days = list(map(lambda x: x["Date"][0].split(",")[0], headers))
    day_counts = []
    for day in days_of_week:
        day_counts.append(email_days.count(day))

    # Display statistics
    util.display_stats(day_counts,
                       "Statistics for days on which emails are set:")

    # Configure bar chart
    num_days = len(days_of_week)
    ind = np.arange(num_days)
    bar_width = 0.6
    chart, ax = plt.subplots()
    rectangles = ax.bar(ind, day_counts, bar_width, color='r', alpha=0.6)
    ax.set_ylabel("Number of emails")
    ax.set_xlabel("Day")
    ax.set_title("Number of emails per day")
    ax.set_xticks(ind)
    ax.set_xticklabels(days_of_week)

    for rect in rectangles:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() / 2.,
                height,
                '%d' % int(height),
                ha='center',
                va='bottom')
    plt.savefig("../../res/images/days.png")
    plt.show()
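A minimal invocation sketch, assuming each header is a map from header name to a list of values with an RFC 2822 style Date string (the sample values below are illustrative):

# Hypothetical usage: analyze_days() reads the weekday token before the
# comma in each Date value.
sample_headers = [
    {"Date": ["Mon, 14 May 2001 16:39:03 -0700"]},
    {"Date": ["Tue, 15 May 2001 08:02:11 -0700"]},
]
analyze_days(sample_headers)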
Example #2
def analyze_years(headers):
    """ Creates a line chart showing the number of emails sent per year """
    util.log_print("Running Year Analysis")
    # Extract the year token; the chained replace() calls repair malformed
    # years with a dropped leading "2" (e.g. "0001" -> "2001") while leaving
    # the literal year "2000" untouched.
    email_years = list(
        map(
            lambda x: x["Date"][0].split(" ")[3].replace("2000", "x").replace(
                "000", "200").replace("x", "2000"), headers))
    unique_years = list(set(email_years))
    unique_years.sort()
    year_counts = []
    for year in unique_years:
        year_counts.append(email_years.count(year))

    # Display statistics
    util.display_stats(year_counts,
                       "Statistics for years in which emails are sent:")

    # Configure line chart
    unique_years = list(map(int, unique_years))
    plt.plot(unique_years, year_counts, color='r', alpha=0.6)
    plt.xlim(1979, 2044)
    plt.ylabel('Number of Emails')
    plt.xlabel('Year')
    plt.title('Number of emails per year')
    plt.savefig("../../res/images/years.png")
    plt.show()
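The chained replace() calls can be checked in isolation; a small demonstration on hypothetical inputs matching the malformed years the chain appears to target:

# Demonstrating the year-repair chain (inputs are hypothetical):
fix_year = lambda y: y.replace("2000", "x").replace("000", "200").replace("x", "2000")

assert fix_year("0001") == "2001"  # dropped leading "2" restored
assert fix_year("2000") == "2000"  # literal year 2000 protected by the "x" swap
assert fix_year("1999") == "1999"  # well-formed years pass through unchanged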
Example #3
def write_to_data_file(data_file_name, num_files):
    """ Retrieves data maps from write queue and appends it to specified data file. """
    global data_to_write_queue
    progress_bar = util.ProgressBar(num_files, 'Extracting headers', 73)
    counter = 0
    with open(data_file_name, FILE_WRITE_MODE) as data_file:
        while True:
            # Keep retrieving header data from write queue until timeout exception 
            # is raised (should mean that no more data will be added to the queue).
            try:
                data = data_to_write_queue.get(block=True, timeout=QUEUE_TIMEOUT)
                stringified_data = util.stringify_headers(data)
                data_file.write('{0}\n'.format(stringified_data))
                counter += 1
                # Update progress bar.
                progress_bar.update(counter)
            except queue.Empty:
                # Finish up the writing process.
                progress_bar.clean()
                util.log_print('{0} entries written'.format(counter))
                break
            except Exception as error:
                print(error)
                util.log_print('{0} entries written'.format(counter))
                break
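The write queue itself is filled elsewhere; a minimal producer sketch, assuming data_to_write_queue is a standard queue.Queue and using a placeholder header map (the parse step is hypothetical):

import queue

data_to_write_queue = queue.Queue()

def extraction_worker(file_names):
    # Hypothetical worker: parse each email file into a header map and
    # enqueue it for write_to_data_file() to drain.
    for name in file_names:
        headers = {"From": ["alice@example.com"],
                   "Date": ["Mon, 14 May 2001 16:39:03 -0700"]}  # placeholder
        data_to_write_queue.put(headers)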
Example #4
def main():
    """ Setup script for header extraction and writing. """
    args = create_arg_parser().parse_args()
    initialise_global_variables(args.ignore_x_headers)
    start_time = time.time()
    acquire_headers_and_write(args.path, args.file)
    end_time = time.time()
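    # The writer blocks up to QUEUE_TIMEOUT seconds waiting for more data
    # before concluding extraction is done, so that final wait is subtracted
    # from the reported run time.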
    execution_time = (end_time - start_time - QUEUE_TIMEOUT) / 60
    util.log_print('{0:.4f} min'.format(execution_time))
Example #5
def main():
    """ Setup for data cleaning. """
    args = create_arg_parser().parse_args()
    check_is_valid_file(args.file)
    initialise_global_variables(args.file)
    start_time = time.time()
    clean_data(args.file)
    end_time = time.time()
    execution_time = (end_time - start_time - QUEUE_TIMEOUT) / 60
    util.log_print('{0:.4f} min'.format(execution_time))
Example #6
def read_headers(filename):
    """ Reads the headers from the specified file and returns them as a list """
    util.log_print("Reading Headers")
    headers = []
    counter = 0
    with open(filename) as file:
        with progressbar.ProgressBar(max_value=NUMBER_OF_HEADERS) as bar:
            for line in file:
                headers.append(json.loads(line))
                bar.update(counter)
                counter += 1
    return headers
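json.loads() is applied per line, so the data file is assumed to be in JSON Lines form, one header map per line; a sketch with an illustrative record:

import json

# One line of the data file, as read_headers() expects it (values illustrative):
line = '{"From": ["alice@example.com"], "To": ["bob@example.com"], "Date": ["Mon, 14 May 2001 16:39:03 -0700"]}'
header = json.loads(line)
assert header["From"][0] == "alice@example.com"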
Example #7
def write_anonymized_headers(current_file_name, headers_list):
    """ Writes anonymized headers to a file. """
    # Create new file name from given file name.
    new_file = util.create_new_data_file_name(current_file_name,
                                              ANON_FILE_NAME_ADDITION)
    counter = 0
    # Write header content to new file.
    with open(new_file, FILE_WRITE_MODE) as data_file:
        for headers in headers_list:
            data = util.stringify_headers(headers)
            data_file.write('{0}\n'.format(data))
            counter += 1
    util.log_print("{} entries anonymized".format(counter))
Example #8
def get_email_file_names(directory):
    """ Recursively collect all file names in specified directory. """
    file_names = []
    # Create spinner to show that script is busy processing.
    spinner = util.Spinner('Seeking files in directory: "{0}"'.format(directory))
    spinner.start()
    # Perform a recursive walk in specified directory.
    for directory_path, directories, files in os.walk(directory):
        for file_name in files:
            file_names.append(os.path.join(directory_path, file_name))
    spinner.stop()
    util.log_print('Found {0} files'.format(len(file_names)))
    return file_names
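An equivalent traversal using pathlib, shown as a sketch (it forgoes the spinner and logging of the original):

from pathlib import Path

def get_email_file_names_pathlib(directory):
    # Recursively collect every regular file under the given directory.
    return [str(path) for path in Path(directory).rglob('*') if path.is_file()]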
Example #9
def read_headers(filename):
    """ Reads the headers from the specified file and returns them as a list """
    # Get number of entries in the given dataset file.
    number_of_headers = util.file_line_count(filename)
    util.log_print("{} entries found".format(number_of_headers))
    # Read the headers line by line.
    headers = []
    counter = 0
    with open(filename) as file:
        with progressbar.ProgressBar(max_value=number_of_headers) as bar:
            for line in file:
                headers.append(json.loads(line))
                bar.update(counter)
                counter += 1
    return headers
Example #10
def analyze_domains(headers, top):
    """ Creates a horizontal bar chart showing the number of emails sent by the top domains """
    util.log_print("Running Domain Analysis")
    valid_headers = list(
        filter(lambda h: len(h["From"][0].split("@")) == 2, headers))
    domains = list(
        map(lambda h: h["From"][0].split("@")[1].split(".")[0], valid_headers))
    unique_domains = set(domains)
    domain_counts = {}
    counter = 0
    with progressbar.ProgressBar(max_value=len(unique_domains)) as bar:
        for domain in unique_domains:
            domain_counts[domain] = domains.count(domain)
            bar.update(counter)
            counter += 1
    sorted_domain_counts = sorted(domain_counts.items(),
                                  key=operator.itemgetter(1))
    sorted_domain_counts.reverse()
    chart_domains = []
    chart_domain_counts = []
    print("Top {0} domains that sent emails:".format(top))
    for x in range(top):
        chart_domains.append(sorted_domain_counts[x][0])
        chart_domain_counts.append(sorted_domain_counts[x][1])
        # Print results
        print("{0}. {1} - {2} emails sent".format(x + 1,
                                                  sorted_domain_counts[x][0],
                                                  sorted_domain_counts[x][1]))
    # Draw horizontal bar chart
    plt.rcdefaults()
    fig, ax = plt.subplots()
    y_pos = np.arange(len(chart_domains))
    ax.barh(y_pos,
            chart_domain_counts,
            align='center',
            color='green',
            ecolor='black')
    ax.set_yticks(y_pos)
    ax.set_yticklabels(chart_domains)
    ax.invert_yaxis()  # labels read top-to-bottom
    ax.set_xlabel('Number of emails sent')
    ax.set_title('Emails Sent per Domain')

    plt.savefig("../../res/images/domainsA.png")
    plt.show()
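The tally loop above calls list.count() once per unique domain, which is quadratic overall; collections.Counter produces the same counts in a single pass (a sketch over the same domains list):

from collections import Counter

# One-pass tally; most_common() returns (domain, count) pairs already sorted
# in descending order, replacing the sort-then-reverse step.
domain_counts = Counter(domains)
sorted_domain_counts = domain_counts.most_common()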
Example #11
def analyze_subjects(headers, words_to_strip):
    """ Creates a word cloud of all subjects"""
    util.log_print("Running Subject Analysis")
    # Map headers to subjects
    subjects = list(map(lambda h: h["Subject"][0], headers))
    text = " ".join(subjects)

    # Strip specified words from subjects
    for word in words_to_strip:
        text = text.replace(word, " ")

    # Generate and save word cloud
    word_cloud = WordCloud(width=1000, height=500,
                           stopwords=set(STOPWORDS)).generate(text)
    plt.figure(figsize=(15, 8))
    plt.imshow(word_cloud)
    plt.axis("off")
    plt.show()
    plt.imsave("../../res/images/sub.png", word_cloud)
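An alternative to string-replacing words_to_strip is to merge them into the stopword set; note the behaviour differs slightly, since replace() also removes substrings inside longer words while stopwords match whole words only. A sketch:

# Hedged alternative: pass the stripped words as extra stopwords.
word_cloud = WordCloud(width=1000, height=500,
                       stopwords=set(STOPWORDS) | set(words_to_strip)).generate(text)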
Example #12
def read_unclean_data_file(unclean_data_file_name):
    """ Reads unclean data file and assigns unclean data lines to worker pool. """
    # Setup a worker pool for cleaning.
    pool = Pool(POOL_PROCESSOR_COUNT)
    # Attempt to open the unclean data file for reading.
    try:
        with open(unclean_data_file_name, FILE_READ_MODE) as unclean_data_file:
            # Fetch every data line in the file.
            for data_line in unclean_data_file:
                # Assign an unclean data line for cleaning to a worker process.
                pool.apply_async(clean_data_line, args=(data_line, ))
    # If there was an error, let the user know.
    except Exception:
        util.log_print(
            'Error reading file: "{0}"'.format(unclean_data_file_name))
        pool.terminate()
        return
    # Wait for worker pool to finish up.
    pool.close()
    pool.join()
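apply_async() discards worker return values, and a worker exception would otherwise pass unnoticed; Pool supports an error_callback for surfacing such failures. A sketch of the same dispatch with one attached:

# Same dispatch, but worker exceptions get logged instead of vanishing.
pool.apply_async(clean_data_line, args=(data_line,),
                 error_callback=lambda err: util.log_print(str(err)))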
Example #13
def get_max_senders(headers, top):
    """ Creates a bar chart showing the number of emails sent by the top senders """
    util.log_print("Running Max Senders Analysis")
    email_addresses = list(map(lambda h: h["From"][0].split("@")[0], headers))
    unique_addresses = list(set(email_addresses))
    address_counts = {}
    counter = 0
    with progressbar.ProgressBar(max_value=len(unique_addresses)) as bar:
        for address in unique_addresses:
            address_counts[address] = email_addresses.count(address)
            bar.update(counter)
            counter += 1
    sorted_address_counts = sorted(address_counts.items(),
                                   key=operator.itemgetter(1))
    sorted_address_counts.reverse()

    graph_emails = []
    graph_counts = []
    for x in range(top):
        graph_emails.append(sorted_address_counts[x][0])
        graph_counts.append(sorted_address_counts[x][1])

    # Display statistics
    util.display_stats(graph_counts, "Statistics for emails sent per person:")

    # Configure bar chart
    num_emails = len(graph_emails)
    ind = np.arange(num_emails)
    bar_width = 0.6
    chart, ax = plt.subplots()
    rectangles = ax.bar(ind, graph_counts, bar_width, color='r', alpha=0.6)
    ax.set_ylabel("Number of emails")
    ax.set_xlabel("Sender")
    ax.set_title("Number of emails per sender")
    ax.set_xticks(ind)
    ax.set_xticklabels(graph_emails)
    plt.xticks(rotation=90)
    plt.tight_layout()  # called last so the rotated labels are not clipped

    plt.savefig("../../res/images/senders.png")
    plt.show()
Example #14
def clean_data(unclean_data_file_name):
    """ Orchestrates reading and writing data processes. """
    global number_of_data_lines
    # Report original number of data lines.
    pre_log_entry = 'Found {0} entries in "{1}"'.format(
        number_of_data_lines.value, unclean_data_file_name)
    util.log_print(pre_log_entry)
    # Create a separate process to read unclean data file.
    read_process = Process(target=read_unclean_data_file,
                           args=(unclean_data_file_name, ))
    read_process.start()
    # Write clean data in current process.
    clean_data_file_name = util.create_new_data_file_name(
        unclean_data_file_name, CLEAN_FILE_NAME_ADDITION)
    entries_written = write_clean_data_file(clean_data_file_name)
    # Report number of clean data lines.
    post_log_entry = '{0} entries written in "{1}"'.format(
        entries_written, clean_data_file_name)
    util.log_print(post_log_entry)
    # Wait for reading process to finish up.
    read_process.join()
Example #15
def analyze_basic(headers):
    """ Perform basic analysis on the email data set """
    util.log_print("Performing Basic Analysis")
    # Check how many emails were sent from the sender to themselves
    # (substring test: the From address appears within the first To entry)
    sent_to_self_count = len(
        list(
            filter(lambda header: header["From"][0] in header["To"][0],
                   headers)))
    sent_to_self_percentage = round(sent_to_self_count / len(headers) * 100, 2)
    print("{0} Emails ({1}%) were sent from the senders to themselves".format(
        sent_to_self_count, sent_to_self_percentage))

    # Check how many emails were sent from the same domain
    valid_headers = list(
        filter(lambda h: len(h["From"][0].split("@")) == 2, headers))
    same_domain_count = len(
        list(
            filter(
                lambda d: d["From"][0].split("@")[1].split(".")[0] in " ".join(
                    d["To"]), valid_headers)))
    same_domain_percentage = round(same_domain_count / len(headers) * 100, 2)
    print("{0} Emails ({1}%) were sent from the same domain".format(
        same_domain_count, same_domain_percentage))

    # Check how many emails were sent to more than one recipient
    multiple_recipient_count = len(
        list(filter(lambda q: len(q["To"]) > 1, headers)))
    multiple_recipient_percentage = round(
        multiple_recipient_count / len(headers) * 100, 2)
    print("{0} Emails ({1}%) were sent to more than one recipient".format(
        multiple_recipient_count, multiple_recipient_percentage))

    # Check how many emails were sent to a single recipient
    single_recipient_count = len(
        list(filter(lambda q: len(q["To"]) == 1, headers)))
    single_recipient_percentage = round(
        single_recipient_count / len(headers) * 100, 2)
    print("{0} Emails ({1}%) were sent to only one recipient".format(
        single_recipient_count, single_recipient_percentage))
Example #16
def analyze_content_types(headers, is_charset):
    """ Creates a pie chart of all content types or charsets"""
    if is_charset:
        util.log_print("Running Charset Analysis")
        content_types = list(
            map(lambda h: h["Content-Type"][1].split("=")[1], headers))
    else:
        util.log_print("Running Content Type Analysis")
        content_types = list(map(lambda h: h["Content-Type"][0], headers))

    unique_types = list(set(content_types))
    counts = []
    for t in unique_types:
        # Count occurrences in the full list, not in the deduplicated one.
        counts.append(content_types.count(t))
    chart, ax1 = plt.subplots()
    ax1.pie(counts,
            labels=unique_types,
            autopct='%1.1f%%',
            shadow=True,
            startangle=90)
    ax1.axis('equal')
    plt.savefig("../../res/images/content" + str(is_charset) + ".png")
    plt.show()
Example #17
def analyze_times(headers):
    """ Creates a line chart showing the number of emails sent per hour """
    util.log_print("Running Time Analysis")
    hours = list(
        map(lambda x: int(x["Date"][0].split(" ")[4].split(":")[0]), headers))
    unique_hours = list(set(hours))
    unique_hours.sort()
    hours_count = []
    for hour in unique_hours:
        hours_count.append(hours.count(hour))

    # Display statistics
    util.display_stats(hours_count,
                       "Statistics for hours in which emails are sent:")

    # Configure line chart
    plt.plot(unique_hours, hours_count, color='g', alpha=0.6)
    plt.xlim(0, 24)
    plt.ylabel('Number of Emails')
    plt.xlabel('Hour of Day')
    plt.title('Number of emails per hour')
    plt.savefig("../../res/images/hours.png")
    plt.show()
Example #18
def analyze_months(headers):
    """ Creates a bar chart showing the number of emails sent per month """
    util.log_print("Running Month Analysis")
    months_of_year = [
        "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct",
        "Nov", "Dec"
    ]
    email_months = list(map(lambda x: x["Date"][0].split(" ")[2], headers))
    month_counts = []
    for month in months_of_year:
        month_counts.append(email_months.count(month))

    # Display statistics
    util.display_stats(month_counts,
                       "Statistics for months in which emails are sent:")

    # Configure bar chart
    num_months = len(months_of_year)
    ind = np.arange(num_months)
    bar_width = 0.6
    chart, ax = plt.subplots()
    rectangles = ax.bar(ind, month_counts, bar_width, color='b', alpha=0.6)
    ax.set_ylabel("Number of emails")
    ax.set_xlabel("Month")
    ax.set_title("Number of emails per month")
    ax.set_xticks(ind)
    ax.set_xticklabels(months_of_year)

    for rect in rectangles:
        height = rect.get_height()
        ax.text(rect.get_x() + rect.get_width() / 2.,
                height,
                '%d' % int(height),
                ha='center',
                va='bottom')
    plt.savefig("../../res/images/months.png")
    plt.show()
Example #19
def main():
    """ Setup for data analysis. """
    args = create_arg_parser().parse_args()
    # Read headers into list
    headers = read_headers(args.file)
    # Perform basic analysis on the full data set
    af.analyze_basic(headers)
    # Perform exploratory analysis on full data set
    util.log_print("Performing Analysis on the Full Data Set")
    bulk_analyze(headers)
    # Perform analysis on emails sent to multiple recipients
    util.log_print("Performing Analysis on Emails Sent to Multiple Recipients")
    multiple_rec_headers = list(filter(lambda h: len(h["To"]) > 1, headers))
    bulk_analyze(multiple_rec_headers)
    # Perform analysis on emails sent to a single recipient
    util.log_print("Performing Analysis on Emails Sent to a Single Recipient")
    single_rec_headers = list(filter(lambda h: len(h["To"]) == 1, headers))
    bulk_analyze(single_rec_headers)
Example #20
    args = parser.parse_args()

    if args.dataset not in ('threshold', 'full', 'gwac'):
        raise Exception('dataset type not supported')

    set_global_variables(args.dataset)

    if not os.path.exists(data_path):
        os.makedirs(data_path)
    if not os.path.exists(pic_path):
        os.makedirs(pic_path)

    p = Pool()
    for sigma in sigma_range:
        p.apply_async(gen,
                      args=(sigma, args.dataset, step_in_second, step,
                            sin_amp_threshold, continuous,
                            sample_seconds_in_one_day, num_days,
                            sample_points_in_one_day, history_len, data_path,
                            pic_path, temp_path, temp_list, relative_amp_range,
                            A_range, T_range, phi_range))
    p.close()
    p.join()
    log_print("All subprocesses done.")

    #gen(0.0, args.dataset, step_in_second, step, sin_amp_threshold,
    #    continuous, sample_seconds_in_one_day, num_days,
    #    sample_points_in_one_day, history_len, data_path, pic_path,
    #    temp_path, temp_list, relative_amp_range, A_range, T_range,
    #    phi_range)
Example #21
def check_is_valid_file(file_name):
    """ Check if the specified file exists. """
    # If the file does not exist, let the user know and exit the script.
    if not is_file(file_name):
        util.log_print('File "{0} does not exist"'.format(file_name))
        sys.exit()
Example #22
def gen(sigma, dataset, step_in_second, step, sin_amp_threshold, continuous,
        sample_seconds_in_one_day, num_days, sample_points_in_one_day,
        history_len, data_path, pic_path, temp_path, temp_list,
        relative_amp_range, A_range, T_range, phi_range):

    gen_num = 0

    for temp_file in temp_list:
        with open(os.path.join(temp_path, temp_file)) as file_handle:
            data_frame = np.loadtxt(file_handle)
        temp_y = data_frame[:, 1]
        temp_amp = abs(temp_y.min())

        if dataset == 'threshold' or dataset == 'full':
            param_combinations = product(relative_amp_range, T_range,
                                         phi_range)
        elif dataset == 'gwac':
            param_combinations = product(A_range, T_range, phi_range)
        else:
            raise Exception('dataset type not supported')

        for A_or_relative_amp, T, phi in param_combinations:
            if dataset == 'threshold' or dataset == 'full':
                A = temp_amp / A_or_relative_amp
                if A <= 1.5:
                    gen_num += 1
            else:
                gen_num += 1

    log_print("Number of samples to be generated: %d" % gen_num)

    ind = 0

    for temp_file in temp_list:
        # 1. read the gravitational microlensing signal template
        with open(os.path.join(temp_path, temp_file)) as file_handle:
            data_frame = np.loadtxt(file_handle)
        temp_x = data_frame[:, 0]
        temp_y = data_frame[:, 1]
        tE = float(temp_file.split('_')[0])

        # 2. trim the flat leading and trailing sections of the template
        i = 0
        while temp_y[i] == temp_y[i + 1]:
            i += 1
        cutted_temp_x = temp_x[i:-(i + 1)]
        cutted_temp_y = temp_y[i:-(i + 1)]

        temp_len = len(cutted_temp_x)  # template length T'
        temp_amp = abs(cutted_temp_y.min())  # template amplitude A'

        if dataset == 'threshold' or dataset == 'full':
            param_combinations = product(relative_amp_range, T_range,
                                         phi_range)
        elif dataset == 'gwac':
            param_combinations = product(A_range, T_range, phi_range)
        else:
            raise Exception('dataset type not supported')

        for A_or_relative_amp, T, phi in param_combinations:
            # num of outliers
            num_outliers = 3

            # sample point numbers in a period
            sin_period = int(T * sample_points_in_one_day)

            if dataset == 'gwac':
                # relative saliency
                A = A_or_relative_amp
                relative_amp = temp_amp / A
            else:
                # amplitude of sine curve
                relative_amp = A_or_relative_amp
                A = temp_amp / A_or_relative_amp
                if A > 1.5:
                    continue

            # generate a single period of the background signal
            single_period_sin = A * np.sin(
                np.linspace(0 + phi, 2 * np.pi + phi, sin_period))

            # final length
            if dataset == 'full' or dataset == 'threshold':
                final_len = history_len + temp_len
            else:
                final_len = history_len

            dup_time = final_len // sin_period
            res_len = final_len % sin_period
            basic_sin = np.tile(single_period_sin, dup_time)
            res_sin = single_period_sin[:res_len]
            basic_sin = np.concatenate([basic_sin, res_sin], axis=0)
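            # (e.g. final_len = 100, sin_period = 30: dup_time = 3 full
            # periods plus res_len = 10 remainder samples, so basic_sin now
            # holds exactly final_len points)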

            if continuous:
                final_x = [i * step for i in range(final_len)]
                noised_sin = basic_sin + normal(0, sigma, len(basic_sin))
            else:
                uncontinuous_basic_sin = np.array([])

                final_x = []
                for day in range(num_days):
                    # The phase of first section is fixed for injecting ML signal
                    if day == 0:
                        random_start_ind = 0
                    else:
                        random_start_ind = randint(0, final_len // 2)

                    uncontinuous_basic_sin = np.concatenate([
                        uncontinuous_basic_sin,
                        basic_sin[random_start_ind:random_start_ind +
                                  sample_points_in_one_day]
                    ], axis=0)

                    day_x = [
                        day + i * step for i in range(sample_points_in_one_day)
                    ]
                    final_x.extend(day_x)

                assert len(basic_sin) == len(uncontinuous_basic_sin)
                noised_sin = uncontinuous_basic_sin \
                             + normal(0, sigma, len(basic_sin))

            # We inject the ML signal into the first section of the background
            # signal and then horizontally reverse the signal, so the ML signal
            # ends up overlapping the desired phase of the background signal.
            final_y = noised_sin
            final_y[:temp_len] = final_y[:temp_len] + cutted_temp_y
            final_y = final_y[::-1]

            assert len(final_y) == final_len
            assert len(final_x) == len(final_y)

            tE_col = np.tile(tE, final_len)
            temp_start_col = np.tile(final_x[-temp_len:][0], final_len)
            temp_end_col = np.tile(final_x[-temp_len:][-1], final_len)

            A_prime = temp_amp
            T_prime = cutted_temp_x[-1] - cutted_temp_x[0]

            # write data
            data_name = "%.3f_%.3f_%.3f_%.3f_%.3f_%.3f.dat" % (
                sigma, T_prime, T, A_prime, relative_amp, phi)

            with open(os.path.join(data_path, data_name), 'w') as data_file:
                for i in range(final_len):
                    # with probability 1/10000 per sample, inject an outlier
                    # (at most num_outliers in total)
                    if randint(10000) == 0:
                        if num_outliers > 0:
                            final_y[i] = -A_prime + normal(0, sigma)
                            num_outliers -= 1

                    print("%5.8f    %5.2f    %5.3f    %5.8f    %5.8f" %
                          (final_x[i], final_y[i], tE_col[i],
                           temp_start_col[i], temp_end_col[i]),
                          file=data_file)

            # write image
            pic_name = "%.3f_%.3f_%.3f_%.3f_%.3f_%.3f.png" % (
                sigma, T_prime, T, A_prime, relative_amp, phi)

            pic_file = os.path.join(pic_path, pic_name)

            pylab.figure()
            pylab.plot(final_x, final_y, '.')

            pylab.title("%.3f_%.3f_%.3f_%.3f_%.3f_%.3f.png" %
                        (sigma, T_prime, T, A_prime, relative_amp, phi))

            # invert the y-axis: smaller magnitudes are brighter in astronomy
            pylab.ylim(final_y.max() + 0.1, final_y.min() - 0.1)
            pylab.ylabel('mag')
            pylab.xlabel('t-t0(day)')
            pylab.savefig(pic_file)
            pylab.close()

            ind += 1
            log_print("finished generating: %s/%s, %d samples generated" %
                      (data_path, data_name, ind))