def runflow(num_clusters, retokenize, recluster, dataset_path, minibatch):

    if retokenize.lower() == 'y':
        retokenize = "1"
    else:
        retokenize = "0"

    if recluster.lower() == 'y':
        recluster = "1"
    else:
        recluster = "0"

    if minibatch.lower() == 'y':
        minibatch = "1"
    else:
        minibatch = "0"

    home_dir = path_utilities.remove_path_end(dataset_path)
    corpusdir = os.path.join(
        home_dir,
        "converted-" + path_utilities.get_last_dir_from_path(dataset_path)
    )  # e.g. /home/ljung/converted-<dataset_name>/

    # record initial time that program started
    t0 = time()

    fr, all_cluster_words, distinct_cluster_labels = main_function(
        num_clusters, retokenize, recluster, corpusdir, dataset_path, 10,
        minibatch)
    bar_clusters(fr, distinct_cluster_labels, num_clusters, home_dir,
                 dataset_path)
    print_cluster_stats(fr, all_cluster_words, dataset_path, num_clusters)

    # print total time taken to run program
    print("\nTime taken: ", time() - t0, " seconds\n")
def get_document_contents(directory, dataset_path):
    dataset_name, file_place = initialize_output_location(dataset_path)

    # load in the extension index file from the output folder
    ext_dict_file_loc = os.path.join(
        file_place, "extension_index_" + dataset_name + ".npy")
    # allow_pickle=True is needed on newer NumPy versions to load a saved dict
    ext_paths = np.load(ext_dict_file_loc, allow_pickle=True).item()

    # "filenames" list of the paths of files
    # "data" list of the contents of files
    filenames = []
    data = []
    # running count of files read
    i = 0

    # get contents of txt files still in original dataset
    txt_paths = ext_paths.get("txt")
    print("Getting .txt contents from " + dataset_path)
    for path in tqdm(txt_paths):
        if os.path.isfile(path):
            i += 1
            # add the path of the file to "filenames"
            filenames.append(path)

            # read the contents of the file and remove newlines
            with open(path, "r", errors='backslashreplace') as fread:
                contents = fread.read().replace("\n", "")
            # add the string of the contents of the file to "data"
            data.append(contents)

    # get contents of converted files in the other directory
    conv_folders = path_utilities.get_immediate_subdirectories(directory)
    # for each folder in the directory (e.g. pdf/ doc/)
    for folder in conv_folders:
        filetype = path_utilities.get_last_dir_from_path(folder)
        if filetype in ["pdf", "doc", "docx"]:  #, "xml", "html"]:
            print("Getting ." + folder + " contents")
            for filename in tqdm(os.listdir(folder)):
                cur_file = os.path.join(folder, filename)
                if os.path.isfile(cur_file):
                    i += 1
                    # add the non-converted filename to "filenames"
                    new_name = path_utilities.str_decode(
                        path_utilities.remove_extension(filename))
                    filenames.append(new_name)

                    # read the contents of the file and remove newlines
                    with open(cur_file, "r",
                              errors='backslashreplace') as fread:
                        contents = fread.read().replace("\n", "")
                    # add the string of the file contents to "data"
                    data.append(contents)

    print("Num total files: ", i)
    print("All directory contents retrieved")
    return filenames, data
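# A short usage sketch, assuming the extension index file has already been
# written by extension_indexer() (see below); both paths are hypothetical.
filenames, data = get_document_contents("/home/user/converted-mydata",
                                         "/home/user/mydata")
print("Loaded", len(data), "documents")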
def initialize_output_location(dataset_path):
    # setup output path as "file_place" outside the repo
    p = Path(Path(os.getcwd()).parent).parent
    dataset_name = path_utilities.get_last_dir_from_path(dataset_path)
    file_place = os.path.join(p, "cluster-datalake-outputs/",
                              dataset_name + "--output")

    if not os.path.isdir(file_place):
        os.mkdir(file_place)
    return dataset_name, file_place
Example #4
def count_and_sort_exts(extensions, num_slices, write_path, dataset_path):

    # the name of the top-level directory of the dataset
    dataset_name = path_utilities.get_last_dir_from_path(dataset_path)

    # a dict mapping tokens to the count of how many times they appear
    ext_count_dict = {}
    # for each extension
    for ext in extensions:
        try:
            # skip extensions that start with "._" or "_" or end with "~"
            if ext[:2] != "._" and ext[-1] != "~" and ext[0] != "_":
                # increment the count for this extension, starting from 0
                # if it has not been seen before
                ext_count_dict[ext] = ext_count_dict.get(ext, 0) + 1
        except IndexError:
            # empty extensions raise IndexError above; only count them
            # if they are purely alphanumeric
            if ext.isalnum():
                ext_count_dict[ext] = ext_count_dict.get(ext, 0) + 1
    sorted_extensions = []
    sorted_counts = []

    with open(
            os.path.join(write_path,
                         "all_ext_counts_" + dataset_name + ".txt"),
            "w") as fd:
        # for each extension in the dict, from largest to smallest count
        for ext in sorted(ext_count_dict, key=ext_count_dict.get,
                          reverse=True):
            # add the extension to a sorted list of extensions
            sorted_extensions.append(ext)
            # add the corresponding count to a list of counts
            sorted_counts.append(ext_count_dict[ext])
            fd.write(ext + ": " + str(ext_count_dict[ext]) + "\n")

    if len(sorted_extensions) < num_slices:
        num_slices = len(sorted_extensions)
    with open(os.path.join(write_path, "top_exts_" + dataset_name + ".txt"),
              'w') as f:
        for i in range(num_slices):
            f.write(sorted_extensions[i] + "\n")

    return sorted_extensions, sorted_counts
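# For reference, the tallying above can also be written with
# collections.Counter; this is a minimal equivalent sketch of the counting
# step only, keeping the same "._", "~" and "_" filters (count_exts is a
# hypothetical helper, not part of the original module).
from collections import Counter

def count_exts(extensions):
    def keep(ext):
        # mirror the filtering rules used in count_and_sort_exts()
        try:
            return ext[:2] != "._" and ext[-1] != "~" and ext[0] != "_"
        except IndexError:
            return ext.isalnum()

    counts = Counter(ext for ext in extensions if keep(ext))
    # most_common() returns (extension, count) pairs sorted from
    # largest to smallest count
    return counts.most_common()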
def runflow(dataset_path): 
   
    #===================================================================
    #=#BLOCK#=#: Get read and write paths for cluster functions 
    #===================================================================
    print("Getting read and write paths for cluster functions. ")  
  
    # check if the dataset location is a valid directory 
    check_valid_dir(dataset_path)
   
    # get its absolute path
    dataset_path = os.path.abspath(dataset_path)
    
    # the name of the top-level directory of the dataset
    dataset_name = get_last_dir_from_path(dataset_path)
     
    # get converted file location and output location
    out_dir = os.path.join(dataset_path, 
                           "../" + "converted-" + dataset_name)
    
    # define the write path for the entire program
    write_path = "../../cluster-datalake-outputs/" + dataset_name + "--output/"
    if not os.path.isdir(write_path):
        os.system("mkdir " + write_path)
    print("All results printing to " + write_path)
    
    # get absolute paths 
    out_dir = os.path.abspath(out_dir)
    write_path = os.path.abspath(write_path)
    
    # get the location of the extension index file
    print("Finding extension index file. ")
    ext_dict_file_loc = os.path.join(write_path, "extension_index_"
                                     + dataset_name + ".npy")
    
    # check if the above paths are valid 
    check_valid_dir(out_dir)
    check_valid_file(ext_dict_file_loc)
    
    # load the extension to path dict
    # allow_pickle=True is needed on newer NumPy versions to load a saved dict
    ext_to_paths_dict = np.load(ext_dict_file_loc, allow_pickle=True).item()
    csv_path_list = []
    txt_path_list = []
    if "csv" in ext_to_paths_dict:
        csv_path_list = ext_to_paths_dict["csv"]
    if "txt" in ext_to_paths_dict:
        txt_path_list = ext_to_paths_dict["txt"]
 
    # location of files converted to csv format
    csv_dir = os.path.join(out_dir, "csv/")
    converted_csv_list = os.listdir(csv_dir)

    doc2vec(dataset_path, dataset_name, write_path, csv_path_list)

    return
Example #6
def plot_extensions(dataset_path, num_extensions):
    
    allpaths = DFS.DFS(dataset_path) 
    p = Path(os.getcwd()).parent
    dataset_name = path_utilities.get_last_dir_from_path(dataset_path)
    write_path = os.path.join(p, "outputs/", dataset_name + "--output/")
    if not os.path.isdir(write_path):
        os.mkdir(write_path)

    # a list of all the file names (without the paths)
    filenames = []
    for path in allpaths:
        filenames.append(path_utilities.get_fname_from_path(path))
    filenames_no_ext, exts = remove_all_extensions(filenames) 
    plot_extension_pie(exts, num_extensions, write_path, dataset_path)

Example #7
def plot_extension_pie(extensions, num_slices, write_path, dataset_path):

    sorted_tuple = count_and_sort_exts(extensions, num_slices, write_path,
                                       dataset_path)
    sorted_exts, sorted_counts = sorted_tuple
    print("Number of unique extensions: ", len(sorted_exts))
    dataset_name = path_utilities.get_last_dir_from_path(dataset_path)
    labels = []
    sizes = []
    if (len(sorted_exts) < num_slices):
        num_slices = len(sorted_exts)
    for x in range(num_slices):
        labels.append(sorted_exts[x])
        sizes.append(sorted_counts[x])

    plt.figure("pie")
    plt.clf()
    plt.pie(sizes, labels=labels)
    plt.axis('equal')
    plt.title(str(num_slices) + " Most Common Extensions in " + dataset_name)
    pie_path = os.path.join(write_path,
                            "top_exts_pie_" + dataset_name + ".png")
    plt.savefig(pie_path, dpi=300)
Example #8
def extension_indexer(dataset_path, n, write_path):
    allpaths = DFS(dataset_path)

    # a list of all the filenames (without paths)
    filenames = []
    for path in allpaths:
        filenames.append(path_utilities.get_fname_from_path(path))
    filenames_no_ext, exts = extensions.remove_all_extensions(filenames)

    sorted_tuple = extensions.count_and_sort_exts(exts, n, write_path,
                                                  dataset_path)
    sorted_exts, sorted_counts = sorted_tuple
    # use sorted_exts[:n] here to convert only the top n extensions
    top_n_exts = sorted_exts

    # makes a dictionary key for each of the top extensions
    ext_locations = {}
    for extension in top_n_exts:
        ext_locations.update({extension: []})

    # checks every file and saves the paths of those with the top extensions
    # in a dict called "ext_locations"
    for fp in allpaths:
        fn = path_utilities.get_fname_from_path(fp)
        if fn[:2] != "._":
            ext = path_utilities.get_single_extension(fn)
            if ext in top_n_exts:
                # append in place; the dict already holds the list
                ext_locations[ext].append(fp)

    dataset_name = path_utilities.get_last_dir_from_path(dataset_path)
    ext_write_path = os.path.join(write_path,
                                  "extension_index_" + dataset_name + ".npy")
    np.save(ext_write_path, ext_locations)

    return ext_locations
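# The saved .npy index is what get_document_contents() and the runflow()
# variants above reload; a minimal round-trip sketch with a hypothetical
# dataset name.
ext_locations = np.load("extension_index_mydata.npy",
                        allow_pickle=True).item()
csv_paths = ext_locations.get("csv", [])
print(len(csv_paths), "csv files indexed")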
Example #9
def runflow(dataset_path, num_clusters, overwrite, overwrite_plot,
            fill_threshold):

    #===================================================================
    #=#: Get read and write paths for cluster functions
    #===================================================================
    print("Getting read and write paths for cluster functions. ")

    if overwrite == 'y' or overwrite == 'Y':
        overwrite = "1"
    if overwrite_plot == 'y' or overwrite_plot == 'Y':
        overwrite_plot = "1"

    # check if the dataset location is a valid directory
    check_valid_dir(dataset_path)

    # get its absolute path
    dataset_path = os.path.abspath(dataset_path)

    # the name of the top-level directory of the dataset
    dataset_name = get_last_dir_from_path(dataset_path)

    # get converted file location and output location
    out_dir = os.path.join(dataset_path, "../" + "converted-" + dataset_name)

    # define the write path for the entire program
    write_path = "../../cluster-datalake-outputs/" + dataset_name + "--output/"
    if not os.path.isdir(write_path):
        os.system("mkdir " + write_path)
    print("All results printing to " + write_path)

    # get absolute paths
    out_dir = os.path.abspath(out_dir)
    write_path = os.path.abspath(write_path)

    if not os.path.isdir(out_dir):
        os.system("mkdir " + out_dir)
    print("Converting to " + out_dir)

    # get the location of the extension index file
    print("Finding extension index file. ")
    ext_dict_file_loc = os.path.join(
        write_path, "extension_index_" + dataset_name + ".npy")
    # check if the above paths are valid
    check_valid_dir(out_dir)
    check_valid_file(ext_dict_file_loc)

    # load the extension to path dict
    # allow_pickle=True is needed on newer NumPy versions to load a saved dict
    ext_to_paths_dict = np.load(ext_dict_file_loc, allow_pickle=True).item()
    csv_path_list = []
    if "csv" in ext_to_paths_dict:
        csv_path_list = ext_to_paths_dict["csv"]

    # location of files converted to csv format
    csv_dir = os.path.join(out_dir, "csv/")

    #===================================================================
    #=#: Generates the files needed for clustering, clusters,
    #    and prints various results.
    #===================================================================

    # csvs with fewer than fill_threshold*100% nonempty cells in every
    # row are thrown out of the clustering.

    # we build two dicts, one for files which were converted to csv
    # format and one for files that were originally in csv format,
    # and merge both into "header_dict".

    # Get the combined header dict
    header_dict_converted = get_header_dict(csv_dir, [], fill_threshold, True)
    header_dict_csv = get_header_dict("", csv_path_list, fill_threshold, False)
    header_dict = dict(header_dict_converted)
    header_dict.update(header_dict_csv)

    # Get the file/header array, distance matrix
    dist_tuple = dist_mat_generator(header_dict, write_path, overwrite,
                                    dataset_name)
    jacc_matrix, filename_header_pairs = dist_tuple

    # cluster, generate labels
    labels = agglomerative(jacc_matrix, num_clusters, filename_header_pairs,
                           overwrite_plot, write_path, dataset_name)

    # plot in 3D
    print("Plotting clusters in R^3. ")
    plot_clusters(jacc_matrix, labels, write_path, overwrite_plot,
                  dataset_name, num_clusters)

    # generate results in pdf and text files
    print("Generating results. ")
    list_cluster_lists, scores = generate_results(filename_header_pairs,
                                                  labels, num_clusters,
                                                  dataset_path, write_path,
                                                  dataset_name)

    # get a table of the most common attributes in each cluster
    print("Getting cluster attributes. ")
    clust_attr_lists = get_cluster_attributes(filename_header_pairs, labels,
                                              num_clusters, write_path,
                                              dataset_name)
    return scores
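# A minimal sketch of how the returned scores tuple is consumed; the
# ordering (freqdrop, silhouette, naive tree distance) follows the
# print/write calls in main() below, and the argument values here are
# hypothetical.
scores = runflow("/path/to/dataset", num_clusters=10, overwrite='y',
                 overwrite_plot='y', fill_threshold=0.4)
freqdrop_score, silhouette_score, naive_score = scores[0], scores[1], scores[2]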
def convert(dataset_path, num_top_exts, num_processes):

    check_valid_dir(dataset_path)

    # the name of the top-level directory of the dataset
    dataset_name = path_utilities.get_last_dir_from_path(dataset_path)

    # get its absolute path
    dataset_path = os.path.abspath(dataset_path)
    #dest = os.path.join(dataset_path, "../finalconverted-" + dataset_name + "/")
    dest = os.path.join(
        Path(dataset_path).parent, "converted-" + dataset_name + "/")
    if not os.path.isdir(dest):
        os.system("mkdir " + dest)
    check_valid_dir(dest)

    # get the script output location
    write_path = os.path.join("../../cluster-datalake-outputs/" +
                              dataset_name + "--output/")

    # get its absolute path
    write_path = os.path.abspath(write_path)
    if not os.path.isdir(write_path):
        os.system("mkdir ../../cluster-datalake-outputs/")
        os.system("mkdir " + write_path)
    check_valid_dir(write_path)

    # create the destination directories for converted files.
    csv_dest = os.path.join(dest, "csv/")
    print("csv_dest: ", csv_dest)
    if not os.path.isdir(csv_dest):
        os.system("mkdir " + csv_dest)

    # get a dictionary which maps extension names of the form "csv"
    # to lists of the full paths of files with those extensions in the
    # dataset.
    # CREATES "extension_index_<dataset_name>.npy"
    ext_locations = DFS.extension_indexer(dataset_path, num_top_exts,
                                          write_path)

    # if extensions with the following names are present, perform the
    # corresponding conversions (several branches below are disabled).
    '''
    comp_paths = [] 
    if "z" in ext_locations:    
        comp_paths = ext_locations.get("z")
        comp_paths.extend(ext_locations.get("gz"))
        comp_paths.extend(ext_locations.get("zip"))
        handle_compressed(comp_paths, dest)
    '''
    '''    
    if "pdf" in ext_locations:
        pdf_paths = ext_locations.get("pdf")
        convert_pdfs(pdf_paths, dest, num_processes)
    if "doc" in ext_locations:
        doc_paths = ext_locations.get("doc")
        convert_doc(doc_paths, dest, num_processes)
    if "docx" in ext_locations:
        docx_paths = ext_locations.get("docx")
        convert_docx(docx_paths, dest, num_processes)
    if "html" in ext_locations:
        html_paths = ext_locations.get("html")
        html_paths.extend(ext_locations.get("htm"))
        convert_html(html_paths, dest, num_processes)
    if "xml" in ext_locations:
        xml_paths = ext_locations.get("xml")
        convert_xml(xml_paths, dest, num_processes)
    '''

    if "xls" in ext_locations:
        xls_paths = ext_locations.get("xls")
        valid_xls = get_valid_filenames_struct(xls_paths)
        convert_tabular(valid_xls, csv_dest, num_processes)
    if "xlsx" in ext_locations:
        xlsx_paths = ext_locations.get("xlsx")
        valid_xlsx = get_valid_filenames_struct(xlsx_paths)
        convert_tabular(valid_xlsx, csv_dest, num_processes)
    if "tsv" in ext_locations:
        tsv_paths = ext_locations.get("tsv")
        valid_tsv = get_valid_filenames_struct(tsv_paths)
        convert_tsv(valid_tsv, csv_dest, num_processes)
Example #11
def main():

    print("ARGUMENTS: ")
    args = load_arguments()
    print("Arguments loaded. ")

    dataset_path = args.dataset_path
    dest = os.path.join(dataset_path, "../")
    num_clusters = args.num_clusters
    num_top_exts = args.num_extensions
    num_processes = args.num_processes
    overwrite_dist = 'y'
    overwrite_plot = 'y'
    fill_threshold = 0.4

    # check if destination is valid, get its absolute path
    check_valid_dir(dest)
    dest = os.path.abspath(dest)

    # check if dataset is valid, get its absolute path
    check_valid_dir(dataset_path)
    dataset_path = os.path.abspath(dataset_path)

    # the name of the top-level directory of the dataset
    dataset_name = get_last_dir_from_path(dataset_path)

    # define the write path for the entire program
    write_path = "../../cluster-datalake-outputs/" + dataset_name + "--output/"
    if not os.path.isdir(write_path):
        os.system("mkdir " + write_path)
    print("All results printing to " + write_path)

    # get absolute path
    write_path = os.path.abspath(write_path)

    # write results to a text file
    f = open(os.path.join(write_path, 'shuffle_test_' + dataset_name + '.txt'),
             'w')
    f.write("shuffle_ratio" + "," + "freqdrop_score" + "," +
            "silhouette_score" + "," + "naive_tree_dist_score" + "," + "\n")

    #===================================================================
    #=#: Shuffle and cluster, recording the ensemble score.
    #===================================================================

    shuffle_tracker = []

    # get a list of the paths to every file in the dataset
    # rooted at "dataset_path"
    filepaths = DFS.DFS(dataset_path)

    # generate path to the new root of our test dataset
    shuffled_dataset_name = "shuffled_" + dataset_name
    shuffled_dataset_path = os.path.join(dest, shuffled_dataset_name)
    print("clustering: ", shuffled_dataset_path)

    # copy dataset to this new location
    os.system("cp -r " + dataset_path + " " + shuffled_dataset_path)

    # we gradually increase the proportion of the test dataset
    # which is shuffled
    shuffle_ratio = 0.0
    while shuffle_ratio <= 1.0:

        # define the write path for the entire program
        write_path = "../../cluster-datalake-outputs/" + shuffled_dataset_name + "--output/"

        # get converted file location and output location
        out_dir = os.path.join(shuffled_dataset_path,
                               "../" + "converted-" + shuffled_dataset_name)

        if not os.path.isdir(write_path):
            os.system("mkdir " + write_path)

        if not os.path.isdir(out_dir):
            os.system("mkdir " + out_dir)
        csv_path = os.path.join(out_dir, "csv/")
        if not os.path.isdir(csv_path):
            os.system("mkdir " + csv_path)
        txt_path = os.path.join(out_dir, "txt/")
        if not os.path.isdir(txt_path):
            os.system("mkdir " + txt_path)

        # shuffle and convert the test dataset
        shuffle_tracker = shuffle(shuffled_dataset_path, shuffle_ratio, False,
                                  shuffle_tracker, filepaths)
        DFS.extension_indexer(shuffled_dataset_path, num_top_exts, write_path)

        # cluster the shuffled test dataset
        scores = schema_clustering.runflow(shuffled_dataset_path, num_clusters,
                                           overwrite_dist, overwrite_plot,
                                           fill_threshold)

        # print results
        print("Shuffle ratio: ", shuffle_ratio, "Freqdrop score: ", scores[0],
              "Silhouette score: ", scores[1], "Naive score: ", scores[2])
        f.write(
            format(shuffle_ratio, '.3f') + "," + format(scores[0], '.3f') +
            "," + format(scores[1], '.3f') + "," + format(scores[2], '.3f') +
            "," + '\n')

        # delete the shuffled dataset, outputs, and converted files
        os.system("rm -r " + write_path)
        os.system("rm -r " + out_dir)

        shuffle_ratio += args.step

    f.close()
    return
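# Assuming this module is meant to be run as a script (load_arguments()
# parses command-line options), a standard entry-point guard would be:
if __name__ == "__main__":
    main()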