def intracluster_dist(cluster_paths):
    # mean distance between the parent directories of consecutive paths
    # within a single cluster
    distances = []
    for i in range(len(cluster_paths)-1):
        path1 = cluster_paths[i]
        path2 = cluster_paths[i+1]
        dirOf_path1 = path_utilities.remove_path_end(path1)
        dirOf_path2 = path_utilities.remove_path_end(path2)
        distances.append(path_dist(dirOf_path1, dirOf_path2))
    dists = np.array(distances)
    return np.mean(dists)
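# A minimal usage sketch; the demo function and paths below are illustrative,
# assuming path_utilities and a path_dist(dir_a, dir_b) helper are importable
# as used above.
def demo_intracluster_dist():
    cluster_paths = [
        "/data/projectA/reports/q1.csv",
        "/data/projectA/reports/q2.csv",
        "/data/projectA/archive/old.csv",
    ]
    # mean path_dist between the parent directories of consecutive paths
    print(intracluster_dist(cluster_paths))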
def initialize_output_location(dataset_path):
    # set up the output path ("file_place") outside the repo, two directory
    # levels above the current working directory
    p = Path(Path(os.getcwd()).parent).parent
    dataset_name = path_utilities.get_last_dir_from_path(dataset_path)
    file_place = os.path.join(p, "cluster-datalake-outputs/", dataset_name + "--output")

    # create the parent "cluster-datalake-outputs" directory if it is missing
    if not os.path.isdir(path_utilities.remove_path_end(file_place)):
        os.mkdir(path_utilities.remove_path_end(file_place))

    if not os.path.isdir(file_place):
        os.mkdir(file_place)
    return dataset_name, file_place
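# A hedged usage sketch; the dataset path and demo function below are
# hypothetical, and it assumes get_last_dir_from_path returns the final path
# component. Note the call has real side effects: it creates the
# "cluster-datalake-outputs" tree two levels above the current working
# directory, as computed above.
def demo_initialize_output_location():
    dataset_name, file_place = initialize_output_location("/data/mylake")
    # dataset_name would be "mylake"; file_place would end with
    # "cluster-datalake-outputs/mylake--output"
    print(dataset_name, file_place)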
def runflow(num_clusters, retokenize, recluster, dataset_path, minibatch):

    if retokenize.lower() == 'y':
        retokenize = "1"
    else:
        retokenize = "0"

    if recluster.lower() == 'y':
        recluster = "1"
    else:
        recluster = "0"

    if minibatch.lower() == 'y':
        minibatch = "1"
    else:
        minibatch = "0"

    home_dir = path_utilities.remove_path_end(dataset_path)
    corpusdir = os.path.join(
        home_dir,
        "converted-" + path_utilities.get_last_dir_from_path(dataset_path)
    )  # e.g. /home/ljung/converted/

    # record initial time that program started
    t0 = time()

    fr, all_cluster_words, distinct_cluster_labels = main_function(
        num_clusters, retokenize, recluster, corpusdir, dataset_path, 10,
        minibatch)
    bar_clusters(fr, distinct_cluster_labels, num_clusters, home_dir,
                 dataset_path)
    print_cluster_stats(fr, all_cluster_words, dataset_path, num_clusters)

    # print total time taken to run program
    print("\nTime taken: ", time() - t0, " seconds\n")
def handle_compressed(comp_paths, dest):
    num_comp = len(comp_paths)
    output_dir = os.path.join(dest, "compressed")
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    for path in comp_paths:
        filename = path_utilities.get_fname_from_path(path)
        # decompress to stdout (-c, which keeps the original) and redirect
        # into "dest". With shell=True the command must be a single string,
        # and subprocess.call blocks until the shell exits, so no explicit
        # wait or cleanup of a local decompressed copy is needed.
        subprocess.call(
            "gzip -d -q -c -f \"" + filename + "\" > \""
            + os.path.join(dest, path_utilities.remove_extension(filename))
            + "\"",
            cwd=path_utilities.remove_path_end(path),
            shell=True)
def bar_clusters(frame, distinct_cluster_labels, num_clusters, home_dir, dataset_path):
    plt.figure("bar")
  
    matplotlib.rcParams.update({'font.size': 4})
    
    dataset_name, file_place = initialize_output_location(dataset_path)
    trailer_text = dataset_name + "_k=" + str(num_clusters)
    
    print("\n\nGenerating barcharts...")    
    pdf = matplotlib.backends.backend_pdf.PdfPages(os.path.join(file_place, "text_barcharts_" + trailer_text + ".pdf"))
    cluster_directories = []
    # for each cluster, generate a bar chart 
    for i in tqdm(distinct_cluster_labels):
        # We allow for the case where some clusters are missing 
        plt.clf()
        paths_in_cluster = {}
        # get the files associated with the current cluster
        cluster_files = frame.loc[frame['cluster']==i]
        for index, row in cluster_files.iterrows():
            path = path_utilities.remove_path_end(row['filename'])
            # if the path is already in the cluster, add to count
            if path in paths_in_cluster:
                paths_in_cluster.update({path:paths_in_cluster.get(path)+1})
            else:
                paths_in_cluster.update({path:1})
        cluster_directories.append(paths_in_cluster)
        sorted_names = []
        sorted_counts = []
        # sort the paths in descending order by number of occurrences
        for e in sorted(paths_in_cluster, key=paths_in_cluster.get, reverse=True):
            trimmed_name = e[len(home_dir):]
            sorted_names.append(trimmed_name)
            sorted_counts.append(paths_in_cluster[e])

        cluster_stats = get_cluster_stats(paths_in_cluster)

        y_pos = np.arange(len(sorted_names))
        plt.bar(y_pos, sorted_counts, align='center', alpha=0.5)
        plt.xticks(y_pos, sorted_names, rotation=90)
        plt.rc('xtick')
        plt.ylabel('Number of files')
        plt.xlabel('Directory')
        plt.title('Directories in Cluster ' + str(i) + "\n" + cluster_stats)
        plt.tight_layout()
        plt.subplots_adjust(bottom=0.38, top=0.87)
        
        pdf.savefig(plt.gcf())
    pdf.close()
    np.save(os.path.join(file_place, "cluster_directories_" + trailer_text + ".npy"), cluster_directories)
    print("Bar charts written to \"text_barcharts_" + trailer_text + ".pdf\"")
def shuffle(dataset_path):
    if (confirm(prompt="Warning, this will scramble the directory " +
                "structure of all files and folders in " + dataset_path +
                ". Are you sure you want to do this? ")):
        print("Ok.")
    else:
        exit()
    if (confirm(prompt="Really sure, though?")):
        print("Ok.")
    else:
        exit()
    if (confirm(prompt="Super duper sure???")):
        print("Ok.")
    else:
        exit()
    # get a list of the paths to every file in the dataset
    # rooted at "dataset_path"
    filepaths = DFS.DFS(dataset_path)
    num_files = len(filepaths)
    # list of the parent directories of every file in
    # "filepaths".
    directory_list = []
    # for each file
    for filepath in filepaths:
        # get its parent directory
        directory = remove_path_end(filepath)
        # and add it to our list of parent directories
        directory_list.append(directory)
    # generate a permutation of the number of files
    perm = np.random.permutation(num_files)
    # for each index
    for i in range(num_files):
        # get the image of the index under our permutation
        permuted_index = perm[i]
        # get the file we're moving
        next_file = filepaths[i]
        # get the randomly chosen destination directory
        dest_dir = directory_list[permuted_index]
        # move the file
        print(next_file)
        os.system("mv " + next_file + " " + dest_dir)
def generate_results(filename_header_pairs, labels, num_clusters, dataset_path,
                     write_path, dataset_name):

    #===================================================================
    #=#BLOCK#=#: Generates two data structures:
    #            "list_cluster_lists": list of lists, each list contains
    #            the filepaths for one cluster.
    #            "cluster_directories": list of dicts, one per cluster,
    #            keys are unique directories, values are counts
    #===================================================================
    print("Creating list of filepaths for each cluster. ")
    print("Creating list of dicts which " + "map directories to frequencies. ")

    # create a dict mapping cluster indices to lists of filepaths
    cluster_filepath_dict = {}

    # list of lists, each list is full of the filepaths for one cluster.
    list_cluster_lists = []

    # list of dicts, keys are unique directories, values are counts
    # each list corresponds to a cluster
    cluster_directories = []

    # initialize each child list.
    for k in range(num_clusters):
        list_cluster_lists.append([])

        # add k empty dicts
        cluster_directories.append({})

    # for each label in labels
    for i in tqdm(range(len(labels))):

        # get the corresponding filename
        filename_header_pair = filename_header_pairs[i]
        filename = filename_header_pair[0]

        # transform "@" delimiters to "/"
        filename = path_utilities.str_decode(filename)

        # remove the actual filename to get its directory
        decoded_filepath = path_utilities.remove_path_end(filename)

        # get common prefix of top level dataset directory
        common_prefix = path_utilities.remove_path_end(dataset_path)

        # remove the common prefix for display on barchart. The " - 1"
        # is so that we include the leading "/".
        len_pre = len(common_prefix)
        len_decod = len(decoded_filepath)
        decoded_filepath_trunc = decoded_filepath[len_pre - 1:len_decod]

        # add it to the appropriate list based on the label
        list_cluster_lists[labels[i]].append(decoded_filepath_trunc)

    # create a list of dicts, one for each cluster, which map dirs to
    # counts.
    for k in range(num_clusters):
        for directory in list_cluster_lists[k]:
            if directory in cluster_directories[k]:
                old_count = cluster_directories[k].get(directory)
                new_count = old_count + 1
                cluster_directories[k].update({directory: new_count})
            else:
                cluster_directories[k].update({directory: 1})

    #===================================================================
    #=#BLOCK#=#: Prints cluster information to .pdf and .txt files.
    #===================================================================
    print("Printing cluster info to .txt and .pdf files. ")

    # get a list of the cluster statistic for printing to pdf
    cluster_stats = get_stats.get_cluster_stats(cluster_directories)

    # compute silhouette coefficients for each cluster (sil_list)
    # and for the entire clustering (sil)
    sil, sil_list = compute_silhouette(cluster_directories, dataset_path)
    l = 0
    for coeff in sil_list:
        # print("Silhouette score for cluster " + str(l)+": "+str(coeff))
        l += 1
    print("Total silhouette for entire clustering: ", sil)

    # get the frequency drop score of the clusters
    fd_scores, fd_total = compute_freqdrop_score(cluster_directories)
    freqdrop_total = fd_total
    freqdrop_scores = fd_scores

    # get the naive tree dist score of the clusters
    td_scores, td_total = compute_naive_score(list_cluster_lists,
                                              cluster_directories)

    # just make font a bit smaller
    matplotlib.rcParams.update({'font.size': 4})
    print("\n\nGenerating barcharts...")

    # open the pdf and text files for writing
    pdf_path = os.path.join(
        write_path, "structured_stats_" + dataset_name + "_k=" +
        str(num_clusters) + ".pdf")
    txt_path = os.path.join(
        write_path, "structured_stats_" + dataset_name + "_k=" +
        str(num_clusters) + ".txt")
    pkl_path = os.path.join(
        write_path,
        "histogram_data_" + dataset_name + "_k=" + str(num_clusters) + ".pkl")
    pdf = PdfPages(pdf_path)
    f = open(txt_path, 'w')

    # save list_cluster_lists to a pkl file
    with open(pkl_path, 'wb') as filehandle:
        pickle.dump(list_cluster_lists, filehandle)

    # for each cluster
    for k in range(num_clusters):
        single_cluster_stats = cluster_stats[k]

        #fig, ax = plt.subplots(nrows=1, ncols=1,figsize=(10, 20))
        #plt.figure(k)
        plt.clf()

        # get frequencies of the paths
        path_counts = Counter(list_cluster_lists[k])

        # Create a dataframe from path_counts
        df = pd.DataFrame.from_dict(path_counts, orient='index')

        # rename the frequency axis
        df = df.rename(columns={df.columns[0]: "freqs"})

        # sort it with highest freqs on top
        sorted_df = df.sort_values("freqs", ascending=False)
        top_10_slice = sorted_df.head(10)
        top_10_slice.plot(kind='bar')

        # leave enough space for x-axis labels
        # fig.subplots_adjust(hspace=7)

        fig_title = ("Directory distribution for cluster " + str(k) + "\n" +
                     "Number of unique directories: " +
                     str(single_cluster_stats[0]) + "\n" + "Mean frequency: " +
                     str(single_cluster_stats[1]) + "\n" +
                     "Median frequency: " + str(single_cluster_stats[3]) +
                     "\n" + "Standard deviation of frequencies: " +
                     str(single_cluster_stats[2]) + "\n" +
                     "Closest common ancestor of all directories: " +
                     single_cluster_stats[4] + "\n" + "Silhouette score: " +
                     str(sil_list[k]) + "\n" + "Frequency drop score: " +
                     str(freqdrop_scores[k]))
        plt.title(fig_title)
        plt.xlabel('Directory')
        plt.ylabel('Quantity of files in directory')
        plt.tight_layout()
        plt.subplots_adjust(bottom=0.38, top=0.87)
        pdf.savefig(plt.gcf())

        # print to .txt file as well
        f.write(fig_title)
        f.write("\n\n")

    # ensemble score: average of the rescaled silhouette and the frequency
    # drop score
    ensemble_score = ((sil + 1) / 2 + freqdrop_total) / 2
    scores = []
    sil = (sil + 1) / 2
    scores.append(freqdrop_total)
    scores.append(sil)
    scores.append(td_total)
    f.write("Total_silhouette: " + str(sil))
    f.write("Total_frequency drop: " + str(freqdrop_total))
    f.write("Total ensemble score: " + str(ensemble_score))
    f.write("Total naive score: " + str(td_total))
    f.close()
    pdf.close()
    return list_cluster_lists, scores
def shuffle(dataset_path, shuffle_ratio, warning, old_shuffle_tracker,
            filepaths):
    if warning:
        if (confirm(prompt="Warning, this will scramble the directory " +
                    "structure of all files and folders in " + dataset_path +
                    ". Are you sure you want to do this? ")):
            print("Ok.")
        else:
            exit()
        if (confirm(prompt="Really sure, though?")):
            print("Ok.")
        else:
            exit()
        if (confirm(prompt="Super duper sure???")):
            print("Ok.")
        else:
            exit()

    # get a list of the paths to every file in the dataset
    # rooted at "dataset_path"
    new_filepaths = DFS.DFS(dataset_path)

    num_files = len(new_filepaths)
    print("Number of files: ", num_files)

    while len(old_shuffle_tracker) < len(filepaths):
        old_shuffle_tracker.append(0)

    # number of files to shuffle, as a fraction of the whole dataset
    num_to_shuffle = math.floor(num_files * shuffle_ratio)
    print("Number of files to shuffle: ", num_to_shuffle)

    # only shuffle part of the dataset
    paths_to_shuffle = new_filepaths[0:num_to_shuffle]

    # generate a permutation of the number of files
    perm = np.random.permutation(num_to_shuffle)
    perm2 = np.random.permutation(num_to_shuffle)

    # "num_to_shuffle" randomly chosen parent directories
    directory_list = []

    # for each file
    for i in range(num_to_shuffle):

        # get the image of the index under our permutation
        permuted_index = perm[i]

        # get its parent directory
        directory = remove_path_end(new_filepaths[permuted_index])

        # and add it to our list of parent directories
        directory_list.append(directory)

    # moves a random file somewhere in "directory_list"
    for i in range(num_to_shuffle):

        # get the image of the index under our permutation
        permuted_index2 = perm2[i]

        # get the file we're moving; if this index points to a file that a
        # previous pass already shuffled, advance to the next index, giving
        # up after 2000 attempts
        next_file = new_filepaths[permuted_index2]
        files_checked = 0
        while (permuted_index2 < len(old_shuffle_tracker)
               and old_shuffle_tracker[permuted_index2] == 1):
            permuted_index2 = (permuted_index2 + 1) % num_to_shuffle
            next_file = new_filepaths[permuted_index2]
            files_checked += 1
            if files_checked > 2000:
                break

        # get the randomly chosen destination directory
        dest_dir = directory_list[i]

        # move the file, only if dest dir isn't parent of next_file
        if remove_path_end(next_file) != dest_dir:
            os.system("mv \"" + next_file + "\" \"" + dest_dir + "\"")

    # create shuffle tracker
    shuffled_DFS = DFS.DFS(dataset_path)
    shuffle_tracker = []
    for i in range(min([len(shuffled_DFS), len(filepaths)])):
        if shuffled_DFS[i] != filepaths[i]:
            shuffle_tracker.append(1)
        else:
            shuffle_tracker.append(old_shuffle_tracker[i])
    return shuffle_tracker
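# A hedged sketch of chaining the incremental shuffle above; the path, ratio,
# and demo function are illustrative, assuming DFS is importable as used in
# shuffle. Each call moves up to shuffle_ratio of the files and returns a
# tracker marking which files have moved, which the next call consumes as
# old_shuffle_tracker.
def demo_incremental_shuffle():
    dataset_path = "/data/mylake"
    filepaths = DFS.DFS(dataset_path)
    tracker = [0] * len(filepaths)
    for _ in range(3):
        tracker = shuffle(dataset_path, 0.1, warning=False,
                          old_shuffle_tracker=tracker, filepaths=filepaths)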