def random_testing(dataset, language="java_python", n_folds=10):
    """Compute pairwise execution distances per cluster for each cross-validation fold.

    For every fold, and every non-noise cluster (label != -1), builds a
    symmetric name->name distance map via ``clusterer.execution_distance``
    and pickles it under ``random_testing/fold_<i>/distances.pkl``.

    :param dataset: Name of dataset
    :param language: Language key used to locate the clusters pickle
    :param n_folds: Number of cross-validation folds
    """
    LOGGER.info("Random testing with '%d' number of folds" % n_folds)
    folds = get_cross_val(dataset, n_folds, as_dict=True)
    base_folder = lib.get_clusters_folder(dataset)
    LOGGER.info("Loading pickle ...")
    clusters = cache.load_pickle(get_cluster_path(dataset, language))
    for fold_index, fold in enumerate(folds):
        pkl_file = os.path.join(base_folder, "random_testing",
                                "fold_%d" % fold_index, "distances.pkl")
        cluster_distances = {}
        for label, members in clusters.items():
            if label == -1:
                continue  # skip the noise cluster
            pair_distances = defaultdict(dict)
            # Every unordered pair of functions in the cluster, stored
            # symmetrically so lookups work in either direction.
            for pos, first in enumerate(members):
                for second in members[pos + 1:]:
                    distance = clusterer.execution_distance(
                        fold[first.name], fold[second.name])
                    pair_distances[first.name][second.name] = distance
                    pair_distances[second.name][first.name] = distance
            cluster_distances[label] = pair_distances
        cache.save_pickle(pkl_file, cluster_distances)
def _load_functions_for_language(dataset, language, update_clone_meta):
    """Load the function set for @language; raises RuntimeError for unknown values."""
    if language == "java":
        return load_functions(dataset, update_clone_meta=update_clone_meta)
    if language == "python":
        return load_py_functions(dataset)
    if language == "java_python":
        return (load_functions(dataset, update_clone_meta=update_clone_meta)
                + load_py_functions(dataset))
    raise RuntimeError("Invalid language: %s" % language)


def _cluster_size_report(n_clusters, sizes, n_functions):
    """Build the markdown report summarizing cluster-size statistics."""
    n_clustered = sum(sizes)
    meta_data = "## Cluster sizes\n"
    meta_data += "* Number of clusters: %d\n" % n_clusters
    meta_data += "* Number of functions clustered: %d\n" % n_clustered
    meta_data += "* Number of functions not clustered: %d\n\n" % (n_functions - n_clustered)
    meta_data += "## REPORT\n"
    meta_data += Stat(sizes).report()
    return meta_data


def compute_similarity(dataset, language=None, functions=None, base_folder=None,
                       file_name=None, skip_singles=False, update_clone_meta=False,
                       clustering_error=0.01, cluster_suffix="base"):
    """Cluster the dataset's functions and persist the clusters plus a size report.

    Writes three sibling files under @base_folder: ``<file_name>.txt`` (human
    readable clusters), ``<file_name>.pkl`` (pickled clusters) and
    ``<file_name>.md`` (size report), and saves the clusters to the DB under
    @cluster_suffix.

    :param dataset: Name of dataset
    :param language: One of "java", "python", "java_python"; used to load
        functions when @functions is not given, and as the fallback file name.
    :param functions: Pre-loaded functions; loaded from @language when falsy.
    :param base_folder: Output folder; defaults to the dataset's clusters folder.
    :param file_name: Base name of the output files; defaults to @language.
    :param skip_singles: Forwarded to the clusterer (skip singleton clusters).
    :param update_clone_meta: Forwarded to the java function loader.
    :param clustering_error: Clustering epsilon forwarded to the clusterer.
    :param cluster_suffix: Suffix under which clusters are stored in the DB.
    :raises RuntimeError: If functions must be loaded and @language is invalid.
    """
    if not functions:
        functions = _load_functions_for_language(dataset, language, update_clone_meta)
    LOGGER.info("Clustering ... ")
    if file_name is None:
        file_name = language or "clusters"
        LOGGER.warning("A @file_name is not provided. Reverting file name to '%s'" % file_name)
    if base_folder is None:
        base_folder = lib.get_clusters_folder(dataset)
    clusters_txt_file = os.path.join(base_folder, "%s.txt" % file_name)
    clusters_pkl_file = os.path.join(base_folder, "%s.pkl" % file_name)
    clusters_report_file = os.path.join(base_folder, "%s.md" % file_name)
    clusters = get_clusterer()(functions).cluster(clusters_txt_file,
                                                  skip_singles=skip_singles,
                                                  clustering_error=clustering_error)
    cache.save_pickle(clusters_pkl_file, clusters)
    clusterer.save_clusters_to_db(dataset, clusters, cluster_suffix)
    # label -1 marks unclustered (noise) functions; exclude it from sizes.
    sizes = [len(cluster_funcs)
             for label, cluster_funcs in clusters.items() if label != -1]
    meta_data = _cluster_size_report(len(clusters), sizes, len(functions))
    cache.write_file(clusters_report_file, meta_data)
def save_only_mixed_clusters(dataset, mixed_file_base_name):
    """
    Save only mixed functions

    :param dataset: Name of dataset
    :param mixed_file_base_name: Type of language eg. java_python
    :return:
    """
    clusters_base_folder = os.path.join(lib.get_clusters_folder(dataset), "cluster_testing")
    for folder in sorted(cache.list_dir(clusters_base_folder, is_absolute=False)):
        LOGGER.info("Processing '%s' ..." % folder)
        folder_path = os.path.join(clusters_base_folder, folder)
        base_clusters = cache.load_pickle(
            os.path.join(folder_path, "%s.pkl" % mixed_file_base_name))
        # Keep multi-function clusters whose members come from more than
        # one source language; -1 is the noise label.
        mixed_clusters = {
            label: funcs
            for label, funcs in base_clusters.items()
            if label != -1 and len(funcs) > 1
            and len({func.source for func in funcs}) > 1
        }
        LOGGER.info("For folder = %s, # of mixed clusters = %d"
                    % (folder, len(mixed_clusters)))
        file_path = os.path.join(folder_path, "only_mixed.txt")
        pkl_path = os.path.join(folder_path, "only_mixed.pkl")
        file_contents = []
        for label, funcs in mixed_clusters.items():
            file_contents.append("\n\n****** Cluster %d ******" % label)
            file_contents.extend(func.body for func in funcs)
        cache.write_file(file_path, "\n".join(file_contents))
        cache.save_pickle(pkl_path, mixed_clusters)
def cluster_testing(dataset, language="java_python"):
    """Run the clusterer at several epsilon values, one result folder per value.

    :param dataset: Name of dataset
    :param language: Type of language eg. java_python
    """
    LOGGER.info("Testing different cluster sizes for dataset '%s' and language '%s'"
                % (dataset, language))
    functions = similarity.load_functions(dataset) + similarity.load_py_functions(dataset)
    base_folder = os.path.join(lib.get_clusters_folder(dataset), "cluster_testing")
    for eps in [0.01, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30]:
        result_folder = os.path.join(base_folder, "eps_%0.2f" % eps)
        similarity.compute_similarity(dataset, language, functions=functions,
                                      base_folder=result_folder,
                                      clustering_error=eps)
def save_only_target_functions(dataset, mixed_file_base_name, target_language):
    """
    Save only java functions from a mixture of java and python clusters

    :param dataset: Name of dataset
    :param mixed_file_base_name: Type of language eg. java_python
    :param target_language: Target Language
    :return:
    """
    clusters_base_folder = os.path.join(lib.get_clusters_folder(dataset), "cluster_testing")
    for folder in sorted(cache.list_dir(clusters_base_folder, is_absolute=False)):
        LOGGER.info("Processing '%s' ..." % folder)
        folder_path = os.path.join(clusters_base_folder, folder)
        cache.mkdir(folder_path)
        base_clusters = cache.load_pickle(
            os.path.join(folder_path, "%s.pkl" % mixed_file_base_name))
        target_clusters = {}
        for label, funcs in base_clusters.items():
            if label == -1 or len(funcs) == 1:
                continue  # skip noise and singleton clusters
            # Keep only clusters made up exclusively of target-language
            # functions (non-empty, so an empty cluster is still excluded).
            if funcs and all(func.source == target_language for func in funcs):
                target_clusters[label] = funcs
        LOGGER.info("For folder = %s, # of '%s' clusters = %d"
                    % (folder, target_language, len(target_clusters)))
        file_path = os.path.join(folder_path, "only_%s.txt" % target_language)
        pkl_path = os.path.join(folder_path, "only_%s.pkl" % target_language)
        file_contents = []
        for label, funcs in target_clusters.items():
            file_contents.append("\n\n****** Cluster %d ******" % label)
            file_contents.extend(func.body for func in funcs)
        cache.write_file(file_path, "\n".join(file_contents))
        cache.save_pickle(pkl_path, target_clusters)
def get_cluster_path(dataset, language):
    """Return the path of the pickled clusters file for @language in @dataset."""
    pkl_name = "%s.pkl" % language
    return os.path.join(lib.get_clusters_folder(dataset), pkl_name)
def _transfer_clusters(dataset):
    """Load the dataset's java clusters pickle and persist it to the DB as 'base'."""
    pkl_path = os.path.join(lib.get_clusters_folder(dataset), "java.pkl")
    save_clusters_to_db(dataset, load_clusters_from_pkl(pkl_path), "base")