def intracluster_dist(cluster_paths):
    distances = []
    for i in range(len(cluster_paths) - 1):
        path1 = cluster_paths[i]
        path2 = cluster_paths[i + 1]
        dirOf_path1 = path_utilities.remove_path_end(path1)
        dirOf_path2 = path_utilities.remove_path_end(path2)
        distances.append(path_dist(dirOf_path1, dirOf_path2))
    dists = np.array(distances)
    return np.mean(dists)
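# Usage sketch (illustrative only; the example paths below are made up, and
# "path_utilities" / "path_dist" are assumed to be the helpers imported
# elsewhere in this module):
#
#     cluster = ["/data/run1/a.csv", "/data/run1/b.csv", "/data/run2/c.csv"]
#     avg_dist = intracluster_dist(cluster)
#     # mean of path_dist() between the parent dirs of consecutive paths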
def initialize_output_location(dataset_path):
    # setup output path as "file_place" outside the repo
    p = Path(Path(os.getcwd()).parent).parent
    dataset_name = path_utilities.get_last_dir_from_path(dataset_path)
    file_place = os.path.join(p, "cluster-datalake-outputs/",
                              dataset_name + "--output")
    if not os.path.isdir(path_utilities.remove_path_end(file_place)):
        os.mkdir(path_utilities.remove_path_end(file_place))
    if not os.path.isdir(file_place):
        os.mkdir(file_place)
    return dataset_name, file_place
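# Usage sketch (illustrative only; the dataset path is hypothetical, and this
# assumes the current working directory sits two levels inside the repo so
# that outputs land in a sibling "cluster-datalake-outputs/" directory):
#
#     name, out_dir = initialize_output_location("/home/user/my-datalake")
#     # name    -> "my-datalake"
#     # out_dir -> ".../cluster-datalake-outputs/my-datalake--output"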
def runflow(num_clusters, retokenize, recluster, dataset_path, minibatch):
    # normalize the y/n flags to "1"/"0" strings
    if retokenize.lower() == 'y':
        retokenize = "1"
    else:
        retokenize = "0"
    if recluster.lower() == 'y':
        recluster = "1"
    else:
        recluster = "0"
    if minibatch.lower() == 'y':
        minibatch = "1"
    else:
        minibatch = "0"
    home_dir = path_utilities.remove_path_end(dataset_path)
    corpusdir = os.path.join(
        home_dir,
        "converted-" + path_utilities.get_last_dir_from_path(dataset_path)
    )  # e.g. /home/ljung/converted/
    # record initial time that program started
    t0 = time()
    fr, all_cluster_words, distinct_cluster_labels = main_function(
        num_clusters, retokenize, recluster, corpusdir,
        dataset_path, 10, minibatch)
    bar_clusters(fr, distinct_cluster_labels, num_clusters,
                 home_dir, dataset_path)
    print_cluster_stats(fr, all_cluster_words, dataset_path, num_clusters)
    # print total time taken to run program
    print("\nTime taken: ", time() - t0, " seconds\n")
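# Usage sketch (illustrative only; the dataset path is hypothetical). The
# y/n arguments are normalized to "1"/"0" strings before being passed to
# main_function():
#
#     runflow(num_clusters=10, retokenize="n", recluster="y",
#             dataset_path="/home/user/my-datalake", minibatch="n")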
def handle_compressed(comp_paths, dest):
    num_comp = len(comp_paths)
    output_dir = os.path.join(dest, "compressed")
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    for path in comp_paths:
        filename = path_utilities.get_fname_from_path(path)
        # decompress in the file's own directory, keeping the original,
        # and redirect the output into "dest"; subprocess.call blocks
        # until the command finishes
        subprocess.call(
            "gzip -d -q -k -f " + filename + " > "
            + os.path.join(dest, path_utilities.remove_extension(filename)),
            cwd=path_utilities.remove_path_end(path), shell=True)
        # remove the decompressed copy left next to the original
        subprocess.call(
            ["rm", path_utilities.remove_extension(filename)],
            cwd=path_utilities.remove_path_end(path))
def bar_clusters(frame, distinct_cluster_labels, num_clusters,
                 home_dir, dataset_path):
    plt.figure("bar")
    matplotlib.rcParams.update({'font.size': 4})
    dataset_name, file_place = initialize_output_location(dataset_path)
    trailer_text = dataset_name + "_k=" + str(num_clusters)
    print("\n\nGenerating barcharts...")
    pdf = matplotlib.backends.backend_pdf.PdfPages(
        os.path.join(file_place, "text_barcharts_" + trailer_text + ".pdf"))
    cluster_directories = []
    # for each cluster, generate a bar chart
    for i in tqdm(distinct_cluster_labels):
        # We allow for the case where some clusters are missing
        plt.clf()
        paths_in_cluster = {}
        # get the files associated with the current cluster
        cluster_files = frame.loc[frame['cluster'] == i]
        for index, row in cluster_files.iterrows():
            path = path_utilities.remove_path_end(row['filename'])
            # if the path is already in the cluster, add to count
            if path in paths_in_cluster:
                paths_in_cluster.update({path: paths_in_cluster.get(path) + 1})
            else:
                paths_in_cluster.update({path: 1})
        cluster_directories.append(paths_in_cluster)
        sorted_names = []
        sorted_counts = []
        # sort the paths in descending order based on # of occurrences
        for e in sorted(paths_in_cluster, key=paths_in_cluster.get,
                        reverse=True):
            trimmed_name = e[len(home_dir):]
            sorted_names.append(trimmed_name)
            sorted_counts.append(paths_in_cluster[e])
        cluster_stats = get_cluster_stats(paths_in_cluster)
        y_pos = np.arange(len(sorted_names))
        plt.bar(y_pos, sorted_counts, align='center', alpha=0.5)
        plt.xticks(y_pos, sorted_names, rotation=90)
        plt.rc('xtick')
        plt.ylabel('Number of files')
        plt.xlabel('Directory')
        plt.title('Directories in Cluster ' + str(i) + "\n" + cluster_stats)
        plt.tight_layout()
        plt.subplots_adjust(bottom=0.38, top=0.87)
        pdf.savefig(plt.gcf())
    pdf.close()
    np.save(os.path.join(file_place,
                         "cluster_directories_" + trailer_text + ".npy"),
            cluster_directories)
    print("Bar charts written to \"text_barcharts_" + trailer_text + ".pdf\"")
def shuffle(dataset_path):
    if (confirm(prompt="Warning, this will scramble the directory "
                + "structure of all files and folders in " + dataset_path
                + ". Are you sure you want to do this? ")):
        print("Ok.")
    else:
        exit()
    if (confirm(prompt="Really sure, though?")):
        print("Ok.")
    else:
        exit()
    if (confirm(prompt="Super duper sure???")):
        print("Ok.")
    else:
        exit()
    # get a list of the paths to every file in the dataset
    # rooted at "dataset_path"
    filepaths = DFS.DFS(dataset_path)
    num_files = len(filepaths)
    # list of the parent directories of every file in "filepaths".
    directory_list = []
    # for each file
    for filepath in filepaths:
        # get its parent directory
        directory = remove_path_end(filepath)
        # and add it to our list of parent directories
        directory_list.append(directory)
    # generate a permutation of the number of files
    perm = np.random.permutation(num_files)
    # for each index
    for i in range(num_files):
        # get the image of the index under our permutation
        permuted_index = perm[i]
        # get the file we're moving
        next_file = filepaths[i]
        # get the randomly chosen destination directory
        dest_dir = directory_list[permuted_index]
        # move the file
        print(next_file)
        os.system("mv \"" + next_file + "\" \"" + dest_dir + "\"")
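# Usage sketch (illustrative only; the path is hypothetical). This variant is
# destructive: it prompts three times for confirmation, then moves every file
# under the dataset root into a randomly chosen existing parent directory:
#
#     shuffle("/home/user/my-datalake-copy")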
def generate_results(filename_header_pairs, labels, num_clusters,
                     dataset_path, write_path, dataset_name):

    #===================================================================
    #=#BLOCK#=#: Generates two data structures:
    #   "list_cluster_lists": list of lists, each list contains
    #                         the filepaths for one cluster.
    #   "cluster_directories": list of dicts, one per cluster,
    #                          keys are unique directories, values are counts
    #===================================================================
    print("Creating list of filepaths for each cluster. ")
    print("Creating list of dicts which "
          + "map directories to frequencies. ")
    # create a dict mapping cluster indices to lists of filepaths
    cluster_filepath_dict = {}
    # list of lists, each list is full of the filepaths for one cluster.
    list_cluster_lists = []
    # list of dicts, keys are unique directories, values are counts
    # each list corresponds to a cluster
    cluster_directories = []
    # initialize each child list.
    for k in range(num_clusters):
        list_cluster_lists.append([])
        # add k empty dicts
        cluster_directories.append({})
    # for each label in labels
    for i in tqdm(range(len(labels))):
        # get the corresponding filename
        filename_header_pair = filename_header_pairs[i]
        filename = filename_header_pair[0]
        # transform "@" delimiters to "/"
        filename = path_utilities.str_decode(filename)
        # remove the actual filename to get its directory
        decoded_filepath = path_utilities.remove_path_end(filename)
        # get common prefix of top level dataset directory
        common_prefix = path_utilities.remove_path_end(dataset_path)
        # remove the common prefix for display on barchart. The " - 1"
        # is so that we include the leading "/".
        len_pre = len(common_prefix)
        len_decod = len(decoded_filepath)
        decoded_filepath_trunc = decoded_filepath[len_pre - 1:len_decod]
        # add it to the appropriate list based on the label
        list_cluster_lists[labels[i]].append(decoded_filepath_trunc)
    # create a list of dicts, one for each cluster, which map dirs to counts.
    for k in range(num_clusters):
        for directory in list_cluster_lists[k]:
            if directory in cluster_directories[k]:
                old_count = cluster_directories[k].get(directory)
                new_count = old_count + 1
                cluster_directories[k].update({directory: new_count})
            else:
                cluster_directories[k].update({directory: 1})

    #===================================================================
    #=#BLOCK#=#: Prints cluster information to .pdf and .txt files.
    #===================================================================
    print("Printing cluster info to .txt and .pdf files.\n")
    # get a list of the cluster statistics for printing to pdf
    cluster_stats = get_stats.get_cluster_stats(cluster_directories)
    # compute silhouette coefficients for each cluster (sil_list)
    # and for the entire clustering (sil)
    sil, sil_list = compute_silhouette(cluster_directories, dataset_path)
    l = 0
    for coeff in sil_list:
        # print("Silhouette score for cluster " + str(l) + ": " + str(coeff))
        l += 1
    print("Total silhouette for entire clustering: ", sil)
    # get the frequency drop score of the clusters
    fd_scores, fd_total = compute_freqdrop_score(cluster_directories)
    freqdrop_total = fd_total
    freqdrop_scores = fd_scores
    # get the naive tree dist score of the clusters
    td_scores, td_total = compute_naive_score(list_cluster_lists,
                                              cluster_directories)
    # just make font a bit smaller
    matplotlib.rcParams.update({'font.size': 4})
    print("\n\nGenerating barcharts...")
    # open the pdf and text files for writing
    pdf_path = os.path.join(
        write_path,
        "structured_stats_" + dataset_name + "_k=" + str(num_clusters) + ".pdf")
    txt_path = os.path.join(
        write_path,
        "structured_stats_" + dataset_name + "_k=" + str(num_clusters) + ".txt")
    pkl_path = os.path.join(
        write_path,
        "histogram_data_" + dataset_name + "_k=" + str(num_clusters) + ".pkl")
    pdf = PdfPages(pdf_path)
    f = open(txt_path, 'w')
    # save list_cluster_lists to a pkl file
    with open(pkl_path, 'wb') as filehandle:
        pickle.dump(list_cluster_lists, filehandle)
    # for each cluster
    for k in range(num_clusters):
        single_cluster_stats = cluster_stats[k]
        # fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10, 20))
        # plt.figure(k)
        plt.clf()
        # get frequencies of the paths
        path_counts = Counter(list_cluster_lists[k])
        # Create a dataframe from path_counts
        df = pd.DataFrame.from_dict(path_counts, orient='index')
        # rename the frequency axis
        df = df.rename(columns={df.columns[0]: "freqs"})
        # sort it with highest freqs on top
        sorted_df = df.sort_values("freqs", ascending=False)
        top_10_slice = sorted_df.head(10)
        top_10_slice.plot(kind='bar')
        # leave enough space for x-axis labels
        # fig.subplots_adjust(hspace=7)
        fig_title = ("Directory distribution for cluster " + str(k) + "\n"
                     + "Number of unique directories: "
                     + str(single_cluster_stats[0]) + "\n"
                     + "Mean frequency: " + str(single_cluster_stats[1]) + "\n"
                     + "Median frequency: " + str(single_cluster_stats[3]) + "\n"
                     + "Standard deviation of frequencies: "
                     + str(single_cluster_stats[2]) + "\n"
                     + "Closest common ancestor of all directories: "
                     + single_cluster_stats[4] + "\n"
                     + "Silhouette score: " + str(sil_list[k]) + "\n"
                     + "Frequency drop score: " + str(freqdrop_scores[k]))
        plt.title(fig_title)
        plt.xlabel('Directory')
        plt.ylabel('Quantity of files in directory')
        plt.tight_layout()
        plt.subplots_adjust(bottom=0.38, top=0.87)
        pdf.savefig(plt.gcf())
        # print to .txt file as well
        f.write(fig_title)
        f.write("\n\n")
    # ensemble score: mean of the rescaled silhouette and the frequency
    # drop score
    ensemble_score = ((sil + 1) / 2 + freqdrop_total) / 2
    scores = []
    # rescale silhouette from [-1, 1] to [0, 1]
    sil = (sil + 1) / 2
    scores.append(freqdrop_total)
    scores.append(sil)
    scores.append(td_total)
    f.write("Total silhouette: " + str(sil) + "\n")
    f.write("Total frequency drop: " + str(freqdrop_total) + "\n")
    f.write("Total ensemble score: " + str(ensemble_score) + "\n")
    f.write("Total naive score: " + str(td_total) + "\n")
    f.close()
    pdf.close()
    return list_cluster_lists, scores
def shuffle(dataset_path, shuffle_ratio, warning, old_shuffle_tracker,
            filepaths):
    if warning == True:
        if (confirm(prompt="Warning, this will scramble the directory "
                    + "structure of all files and folders in " + dataset_path
                    + ". Are you sure you want to do this? ")):
            print("Ok.")
        else:
            exit()
        if (confirm(prompt="Really sure, though?")):
            print("Ok.")
        else:
            exit()
        if (confirm(prompt="Super duper sure???")):
            print("Ok.")
        else:
            exit()
    # get a list of the paths to every file in the dataset
    # rooted at "dataset_path"
    new_filepaths = DFS.DFS(dataset_path)
    num_files = len(new_filepaths)
    print("Number of files: ", num_files)
    while len(old_shuffle_tracker) < len(filepaths):
        old_shuffle_tracker.append(0)
    # we randomly shuffle the list of filepaths
    num_to_shuffle = math.floor(num_files * shuffle_ratio)
    print(num_to_shuffle)
    # only shuffle part of the dataset
    paths_to_shuffle = new_filepaths[0:num_to_shuffle]
    # generate a permutation of the number of files
    perm = np.random.permutation(num_to_shuffle)
    perm2 = np.random.permutation(num_to_shuffle)
    # "num_to_shuffle" randomly chosen parent directories
    directory_list = []
    # for each file
    for i in range(num_to_shuffle):
        # get the image of the index under our permutation
        permuted_index = perm[i]
        # get its parent directory
        directory = remove_path_end(new_filepaths[permuted_index])
        # and add it to our list of parent directories
        directory_list.append(directory)
    # moves a random file somewhere in "directory_list"
    for i in range(num_to_shuffle):
        # get the image of the index under our permutation
        permuted_index2 = perm2[i]
        # get the file we're moving
        next_file = "iiiiiiiiiiiiiiiiii"
        files_checked = 0
        while old_shuffle_tracker[permuted_index2] == 1:
            next_file = new_filepaths[permuted_index2]
            files_checked += 1
            if files_checked > 2000:
                break
        # get the randomly chosen destination directory
        dest_dir = directory_list[i]
        # move the file, only if dest dir isn't parent of next_file
        if remove_path_end(next_file) != dest_dir:
            os.system("mv \"" + next_file + "\" \"" + dest_dir + "\"")
    # create shuffle tracker
    shuffled_DFS = DFS.DFS(dataset_path)
    shuffle_tracker = []
    for i in range(min([len(shuffled_DFS), len(filepaths)])):
        if shuffled_DFS[i] != filepaths[i]:
            shuffle_tracker.append(1)
        else:
            shuffle_tracker.append(old_shuffle_tracker[i])
    return shuffle_tracker
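# Usage sketch (illustrative only; the arguments are hypothetical). This
# variant moves roughly shuffle_ratio * num_files files and returns an
# updated per-file tracker marking which paths changed relative to the
# "filepaths" list recorded before shuffling:
#
#     original_paths = DFS.DFS("/home/user/my-datalake-copy")
#     tracker = shuffle("/home/user/my-datalake-copy", shuffle_ratio=0.25,
#                       warning=True, old_shuffle_tracker=[],
#                       filepaths=original_paths)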