import os
import pickle
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from tqdm import tqdm

import get_stats
import path_utilities

# initialize_output_location, compute_silhouette, compute_freqdrop_score,
# and compute_naive_score are assumed to be defined elsewhere in this
# module.


def get_document_contents(directory, dataset_path):
    dataset_name, file_place = initialize_output_location(dataset_path)

    # load the extension index file from the output folder; it was saved
    # with np.save() as a pickled dict, hence allow_pickle and .item()
    ext_dict_file_loc = os.path.join(
        file_place, "extension_index_" + dataset_name + ".npy")
    ext_paths = np.load(ext_dict_file_loc, allow_pickle=True).item()

    # "filenames": list of the paths of files
    # "data": list of the contents of files
    filenames = []
    data = []
    i = 0

    # get contents of .txt files still in the original dataset
    txt_paths = ext_paths.get("txt", [])
    print("Getting .txt contents from " + dataset_path)
    for path in tqdm(txt_paths):
        if os.path.isfile(path):
            i += 1
            # add the path of the file to "filenames"
            filenames.append(path)
            # read the contents of the file and remove newlines
            with open(path, "r", errors="backslashreplace") as fread:
                contents = fread.read()
            contents = contents.replace("\n", "")
            # add the string of the contents of the file to "data"
            data.append(contents)

    # get contents of converted files in the other directory
    conv_folders = path_utilities.get_immediate_subdirectories(directory)
    # for each folder in the directory (e.g. pdf/, doc/)
    for folder in conv_folders:
        filetype = path_utilities.get_last_dir_from_path(folder)
        if filetype in ["pdf", "doc", "docx"]:  # , "xml", "html"]:
            print("Getting ." + filetype + " contents")
            for filename in tqdm(os.listdir(folder)):
                cur_file = os.path.join(folder, filename)
                if os.path.isfile(cur_file):
                    i += 1
                    # add the non-converted filename to "filenames"
                    new_name = path_utilities.str_decode(
                        path_utilities.remove_extension(filename))
                    filenames.append(new_name)
                    # read the contents of the file and remove newlines
                    with open(cur_file, "r",
                              errors="backslashreplace") as fread:
                        contents = fread.read()
                    contents = contents.replace("\n", "")
                    # add the string of the file contents to "data"
                    data.append(contents)

    print("Num total files: ", i)
    print("All directory contents retrieved")
    return filenames, data
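
# A minimal usage sketch (not part of the pipeline): builds the kind of
# extension index get_document_contents() expects to find on disk. The
# "demo" dataset name and the paths below are hypothetical; in the real
# pipeline the index is written by an earlier step and located via
# initialize_output_location().
def _example_extension_index(file_place):
    # map file extensions to lists of absolute paths
    index = {"txt": ["/data/demo/a.txt", "/data/demo/notes/b.txt"],
             "pdf": ["/data/demo/c.pdf"]}
    # np.save() pickles the dict, which is why get_document_contents()
    # reloads it with allow_pickle=True and .item()
    np.save(os.path.join(file_place,
                         "extension_index_demo.npy"), index)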
def generate_results(filename_header_pairs, labels, num_clusters,
                     dataset_path, write_path, dataset_name):
    #===================================================================
    #=#BLOCK#=#: Generates two data structures:
    #   "list_cluster_lists": list of lists, each list contains
    #       the filepaths for one cluster.
    #   "cluster_directories": list of dicts, one per cluster,
    #       keys are unique directories, values are counts.
    #===================================================================
    print("Creating list of filepaths for each cluster.")
    print("Creating list of dicts which map directories to frequencies.")

    # list of lists, each inner list holds the filepaths for one cluster
    list_cluster_lists = []
    # list of dicts, one per cluster; keys are unique directories,
    # values are counts
    cluster_directories = []
    # initialize each child list and dict
    for k in range(num_clusters):
        list_cluster_lists.append([])
        cluster_directories.append({})

    # for each label in labels
    for i in tqdm(range(len(labels))):
        # get the corresponding filename
        filename_header_pair = filename_header_pairs[i]
        filename = filename_header_pair[0]
        # transform "@" delimiters to "/"
        filename = path_utilities.str_decode(filename)
        # remove the actual filename to get its directory
        decoded_filepath = path_utilities.remove_path_end(filename)
        # get the common prefix of the top-level dataset directory
        common_prefix = path_utilities.remove_path_end(dataset_path)
        # remove the common prefix for display on the barchart; the
        # "- 1" keeps the leading "/"
        len_pre = len(common_prefix)
        decoded_filepath_trunc = decoded_filepath[len_pre - 1:]
        # add it to the appropriate list based on the label
        list_cluster_lists[labels[i]].append(decoded_filepath_trunc)

    # populate each cluster's dict with directory counts
    for k in range(num_clusters):
        for directory in list_cluster_lists[k]:
            if directory in cluster_directories[k]:
                cluster_directories[k][directory] += 1
            else:
                cluster_directories[k][directory] = 1

    #===================================================================
    #=#BLOCK#=#: Prints cluster information to .pdf and .txt files.
    #===================================================================
    print("Printing cluster info to .txt and .pdf files.\n")

    # get a list of the cluster statistics for printing to pdf
    cluster_stats = get_stats.get_cluster_stats(cluster_directories)

    # compute silhouette coefficients for each cluster (sil_list)
    # and for the entire clustering (sil)
    sil, sil_list = compute_silhouette(cluster_directories, dataset_path)
    # per-cluster scores are available in sil_list if needed:
    # for l, coeff in enumerate(sil_list):
    #     print("Silhouette score for cluster " + str(l) + ": " + str(coeff))
    print("Total silhouette for entire clustering: ", sil)

    # get the frequency drop score of the clusters
    freqdrop_scores, freqdrop_total = compute_freqdrop_score(
        cluster_directories)

    # get the naive tree distance score of the clusters
    td_scores, td_total = compute_naive_score(list_cluster_lists,
                                              cluster_directories)

    # make the font a bit smaller so the figure titles fit
    matplotlib.rcParams.update({'font.size': 4})
    print("\n\nGenerating barcharts...")

    # build the output paths and open the pdf and text files for writing
    pdf_path = os.path.join(
        write_path, "structured_stats_" + dataset_name
        + "_k=" + str(num_clusters) + ".pdf")
    txt_path = os.path.join(
        write_path, "structured_stats_" + dataset_name
        + "_k=" + str(num_clusters) + ".txt")
    pkl_path = os.path.join(
        write_path, "histogram_data_" + dataset_name
        + "_k=" + str(num_clusters) + ".pkl")
    pdf = PdfPages(pdf_path)
    f = open(txt_path, 'w')

    # save list_cluster_lists to a pkl file
    with open(pkl_path, 'wb') as filehandle:
        pickle.dump(list_cluster_lists, filehandle)

    # for each cluster
    for k in range(num_clusters):
        single_cluster_stats = cluster_stats[k]
        plt.clf()
        # get frequencies of the paths
        path_counts = Counter(list_cluster_lists[k])
        # create a dataframe from path_counts
        df = pd.DataFrame.from_dict(path_counts, orient='index')
        # rename the frequency column
        df = df.rename(columns={df.columns[0]: "freqs"})
        # sort it with the highest freqs on top and plot the top 10
        sorted_df = df.sort_values("freqs", ascending=False)
        top_10_slice = sorted_df.head(10)
        top_10_slice.plot(kind='bar')
        fig_title = (
            "Directory distribution for cluster " + str(k) + "\n"
            + "Number of unique directories: "
            + str(single_cluster_stats[0]) + "\n"
            + "Mean frequency: " + str(single_cluster_stats[1]) + "\n"
            + "Median frequency: " + str(single_cluster_stats[3]) + "\n"
            + "Standard deviation of frequencies: "
            + str(single_cluster_stats[2]) + "\n"
            + "Closest common ancestor of all directories: "
            + single_cluster_stats[4] + "\n"
            + "Silhouette score: " + str(sil_list[k]) + "\n"
            + "Frequency drop score: " + str(freqdrop_scores[k]))
        plt.title(fig_title)
        plt.xlabel('Directory')
        plt.ylabel('Quantity of files in directory')
        # leave enough space for the x-axis labels
        plt.tight_layout()
        plt.subplots_adjust(bottom=0.38, top=0.87)
        pdf.savefig(plt.gcf())
        # print to the .txt file as well
        f.write(fig_title)
        f.write("\n\n")

    # rescale silhouette from [-1, 1] to [0, 1] and average it with the
    # frequency drop score to get the ensemble score
    ensemble_score = ((sil + 1) / 2 + freqdrop_total) / 2
    sil = (sil + 1) / 2
    scores = [freqdrop_total, sil, td_total]
    f.write("Total silhouette: " + str(sil) + "\n")
    f.write("Total frequency drop: " + str(freqdrop_total) + "\n")
    f.write("Total ensemble score: " + str(ensemble_score) + "\n")
    f.write("Total naive score: " + str(td_total) + "\n")
    f.close()
    pdf.close()
    return list_cluster_lists, scores