def cluster(clustering_error):
    """
    Cluster the loaded pandas and R functions and write the results to disk.

    Output goes to a sub-folder of ``BASE_CLUSTER_FOLDER`` named after
    ``clustering_error`` (formatted with ``%0.02f``):
    ``clusters.txt`` (path handed to the clusterer), ``clusters.pkl``
    (pickle of the returned clusters) and ``clusters.md`` (size report).
    :param clustering_error: Value forwarded to the clusterer as
      ``clustering_error``; also used to name the output folder
    :return: None
    """
    # Merge both function sets into a single mapping.
    all_functions = load_functions(PD_FUNCTIONS_PATH, "pandas")
    all_functions.update(load_functions(R_FUNCTIONS_PATH, "R"))

    base_name = "clusters"
    out_dir = os.path.join(BASE_CLUSTER_FOLDER, "%0.02f" % clustering_error)
    cache.mkdir(out_dir)
    txt_path = os.path.join(out_dir, "%s.txt" % base_name)
    pkl_path = os.path.join(out_dir, "%s.pkl" % base_name)
    report_path = os.path.join(out_dir, "%s.md" % base_name)

    rep_clusterer = RepresentativeClusterer(
        all_functions.values(), distance_function=execution_distance)
    clusters = rep_clusterer.cluster(
        txt_path, skip_singles=True, clustering_error=clustering_error)
    cache.save_pickle(pkl_path, clusters)

    # Label -1 is left out of the size statistics.
    sizes = []
    for label, members in clusters.items():
        if label != -1:
            sizes.append(len(members))
    n_clustered = sum(sizes)

    report = "".join([
        "## Cluster sizes\n",
        "* Number of clusters: %d\n" % len(clusters),
        "* Number of functions clustered: %d\n" % n_clustered,
        "* Number of functions not clustered: %d\n\n" % (
            len(all_functions) - n_clustered),
        "## REPORT\n",
        stat.Stat(sizes).report(),
    ])
    cache.write_file(report_path, report)
def export_runner_ast(xl_path):
    """
    Export four "d_ast" difference sheets into one Excel workbook.

    The workbook is created at ``props.EXPORT_HOME/<xl_path>`` using the
    xlsxwriter engine, and ``export_similar_differences`` is called once per
    sheet with the numeric bounds listed below.
    :param xl_path: File name (relative to ``props.EXPORT_HOME``) of the
      workbook to write
    :return: None
    """
    cache.mkdir(props.EXPORT_HOME)
    # (first bound, second bound, sheet name) for each exported sheet.
    # NOTE(review): judging by the sheet names the bounds look like a
    # similarity and a syntactic threshold -- confirm against
    # export_similar_differences.
    sheets = (
        (0.9, -9, "HighSim-HighSyn"),
        (0.9, 22, "HighSim-LowSyn"),
        (-0.1, -9, "LowSim-HighSyn"),
        (-0.1, 22, "LowSim-LowSyn"),
    )
    workbook_path = os.path.join(props.EXPORT_HOME, xl_path)
    # The context manager saves and closes the workbook exactly once, and
    # also releases the file handle if one of the exports raises.
    with pd.ExcelWriter(workbook_path, engine='xlsxwriter') as writer:
        for first_bound, second_bound, sheet_name in sheets:
            export_similar_differences(first_bound, second_bound, writer,
                                       sheet_name, "d_ast")
def remove_overlapping_clusters(dataset, language="java_python"):
    """
    Prune overlapping functions from every stored cluster and save the result.

    Loads ``<META_RESULTS_FOLDER>/<dataset>/clusters/<language>.pkl``, reduces
    each cluster to a list of functions that do not overlap each other (as
    decided by ``overlaps`` on their stored metadata), and keeps only the
    clusters that still hold more than one function. The pruned clusters are
    written to the ``non_overlapping`` sub-folder as ``<language>.pkl``,
    ``<language>.txt`` and a ``<language>.md`` size report, and are also
    passed to ``clusterer.save_clusters_to_db``.
    :param dataset: Name of dataset
    :param language: Base name of the clusters pickle to load; reused for the
      output file names
    :return: None
    """
    # TODO: Think about how to remove syntactic equivalence
    store = mongo_store.FunctionStore(dataset)
    base_file = os.path.join(properties.META_RESULTS_FOLDER, dataset,
                             "clusters", "%s.pkl" % language)
    clusters = cache.load_pickle(base_file)
    non_overlapping_clusters = {}
    for label, functions in clusters.items():
        # Skip the -1 label and clusters holding exactly one function.
        if label == -1 or len(functions) == 1:
            continue
        non_overlapping_funcs = []
        # Metadata of every function seen so far in this cluster, by base name.
        metas = {}
        for func in functions:
            meta = store.load_metadata({"name": func.base_name})
            metas[func.base_name] = meta
            # The first function of a cluster is always kept.
            if len(non_overlapping_funcs) == 0:
                non_overlapping_funcs.append(func)
                continue
            is_non_overlapping_funcs_updated = False
            for i, existing_func in enumerate(non_overlapping_funcs):
                existing_meta = metas[existing_func.base_name]
                if overlaps(meta, existing_meta):
                    is_non_overlapping_funcs_updated = True
                    # On an overlap, the new function takes the kept slot only
                    # when is_more_succinct() prefers it.
                    if is_more_succinct(meta, existing_meta):
                        non_overlapping_funcs[i] = func
                    # Only the first overlapping kept function is considered.
                    break
            # No overlap with any kept function: keep this one as well.
            if not is_non_overlapping_funcs_updated:
                non_overlapping_funcs.append(func)
        # A cluster is retained only if more than one function survives.
        if len(non_overlapping_funcs) > 1:
            non_overlapping_clusters[label] = non_overlapping_funcs
    write_folder = os.path.join(properties.META_RESULTS_FOLDER, dataset,
                                "clusters", "non_overlapping")
    cache.mkdir(write_folder)
    clusters_txt_file = os.path.join(write_folder, "%s.txt" % language)
    clusters_pkl_file = os.path.join(write_folder, "%s.pkl" % language)
    clusters_report_file = os.path.join(write_folder, "%s.md" % language)
    cache.save_pickle(clusters_pkl_file, non_overlapping_clusters)
    clusterer.save_clusters_to_db(dataset, non_overlapping_clusters,
                                  "non_overlapping")
    clusterer.save_clusters_to_txt(non_overlapping_clusters, clusters_txt_file)
    sizes = [
        len(cluster_funcs)
        for label, cluster_funcs in non_overlapping_clusters.items()
        if label != -1
    ]
    # Markdown report: cluster counts followed by the Stat summary of sizes.
    meta_data = "## Cluster sizes\n"
    meta_data += "* Number of clusters: %d\n" % len(non_overlapping_clusters)
    meta_data += "* Number of functions clustered: %d\n" % sum(sizes)
    meta_data += "## REPORT\n"
    meta_data += Stat(sizes).report()
    cache.write_file(clusters_report_file, meta_data)
def save_only_target_functions(dataset, mixed_file_base_name, target_language):
    """
    Save the clusters made up solely of functions in the target language.

    For every sub-folder of the dataset's ``cluster_testing`` folder, the
    ``<mixed_file_base_name>.pkl`` clusters are loaded and the clusters whose
    functions all have ``source == target_language`` are written next to it
    as ``only_<target_language>.txt`` and ``only_<target_language>.pkl``.
    :param dataset: Name of dataset
    :param mixed_file_base_name: Base name of the mixed clusters pickle
      eg. java_python
    :param target_language: Source value every function of a kept cluster
      must have
    :return: None
    """
    testing_root = os.path.join(lib.get_clusters_folder(dataset),
                                "cluster_testing")
    sub_folders = sorted(cache.list_dir(testing_root, is_absolute=False))
    for folder in sub_folders:
        LOGGER.info("Processing '%s' ..." % folder)
        folder_path = os.path.join(testing_root, folder)
        cache.mkdir(folder_path)
        mixed_pkl = os.path.join(folder_path, "%s.pkl" % mixed_file_base_name)
        base_clusters = cache.load_pickle(mixed_pkl)

        target_clusters = {}
        for label, functions in base_clusters.items():
            # Skip the -1 label and clusters holding exactly one function.
            if label == -1 or len(functions) == 1:
                continue
            # Keep the cluster only when it is non-empty and every function
            # comes from the target language.
            is_target = [func.source == target_language for func in functions]
            if is_target and all(is_target):
                target_clusters[label] = functions
        LOGGER.info("For folder = %s, # of '%s' clusters = %d" %
                    (folder, target_language, len(target_clusters)))

        txt_path = os.path.join(folder_path, "only_%s.txt" % target_language)
        pkl_path = os.path.join(folder_path, "only_%s.pkl" % target_language)
        # Text dump: a header per cluster followed by each function body.
        dump_lines = []
        for label, functions in target_clusters.items():
            dump_lines.append("\n\n****** Cluster %d ******" % label)
            dump_lines.extend(func.body for func in functions)
        cache.write_file(txt_path, "\n".join(dump_lines))
        cache.save_pickle(pkl_path, target_clusters)
def save_only_mixed_clusters(dataset, mixed_file_base_name):
    """
    Save the clusters whose functions come from more than one source.

    For every sub-folder of the dataset's ``cluster_testing`` folder, the
    ``<mixed_file_base_name>.pkl`` clusters are loaded and the clusters
    containing at least two distinct ``source`` values are written next to it
    as ``only_mixed.txt`` and ``only_mixed.pkl``.
    :param dataset: Name of dataset
    :param mixed_file_base_name: Base name of the mixed clusters pickle
      eg. java_python
    :return: None
    """
    testing_root = os.path.join(lib.get_clusters_folder(dataset),
                                "cluster_testing")
    sub_folders = sorted(cache.list_dir(testing_root, is_absolute=False))
    for folder in sub_folders:
        LOGGER.info("Processing '%s' ..." % folder)
        folder_path = os.path.join(testing_root, folder)
        cache.mkdir(folder_path)
        mixed_pkl = os.path.join(folder_path, "%s.pkl" % mixed_file_base_name)
        base_clusters = cache.load_pickle(mixed_pkl)

        mixed_clusters = {}
        for label, functions in base_clusters.items():
            # Skip the -1 label and clusters holding exactly one function.
            if label == -1 or len(functions) == 1:
                continue
            distinct_sources = {func.source for func in functions}
            if len(distinct_sources) > 1:
                mixed_clusters[label] = functions
        LOGGER.info("For folder = %s, # of mixed clusters = %d" %
                    (folder, len(mixed_clusters)))

        txt_path = os.path.join(folder_path, "only_mixed.txt")
        pkl_path = os.path.join(folder_path, "only_mixed.pkl")
        # Text dump: a header per cluster followed by each function body.
        dump_lines = []
        for label, functions in mixed_clusters.items():
            dump_lines.append("\n\n****** Cluster %d ******" % label)
            dump_lines.extend(func.body for func in functions)
        cache.write_file(txt_path, "\n".join(dump_lines))
        cache.save_pickle(pkl_path, mixed_clusters)