def random_testing(dataset, language="java_python", n_folds=10):
    """Compute pairwise execution distances per cluster over cross-validation folds.

    For each fold, every non-noise cluster is walked pairwise and the symmetric
    distance map is pickled under the fold's "random_testing" folder.

    :param dataset: Name of dataset.
    :param language: Language key used to locate the cluster pickle.
    :param n_folds: Number of cross-validation folds.
    """
    LOGGER.info("Random testing with '%d' number of folds" % n_folds)
    folds = get_cross_val(dataset, n_folds, as_dict=True)
    base_folder = lib.get_clusters_folder(dataset)
    LOGGER.info("Loading pickle ...")
    cluster_path = get_cluster_path(dataset, language)
    clusters = cache.load_pickle(cluster_path)
    for fold_index, fold in enumerate(folds):
        out_file = os.path.join(base_folder, "random_testing",
                                "fold_%d" % fold_index, "distances.pkl")
        distances_by_label = {}
        for label, members in clusters.items():
            if label == -1:  # -1 marks unclustered (noise) functions
                continue
            pair_distances = defaultdict(dict)
            # Visit each unordered pair exactly once; record both directions.
            for a in range(len(members) - 1):
                for b in range(a + 1, len(members)):
                    assert a != b
                    func_a, func_b = members[a], members[b]
                    d = clusterer.execution_distance(fold[func_a.name],
                                                     fold[func_b.name])
                    pair_distances[func_a.name][func_b.name] = d
                    pair_distances[func_b.name][func_a.name] = d
            distances_by_label[label] = pair_distances
        cache.save_pickle(out_file, distances_by_label)
def compute_similarity(dataset, language=None, functions=None, base_folder=None,
                       file_name=None, skip_singles=False,
                       update_clone_meta=False, clustering_error=0.01,
                       cluster_suffix="base"):
    """Cluster the dataset's functions and persist txt / pkl / markdown artifacts.

    :param dataset: Name of dataset.
    :param language: One of "java", "python", "java_python"; used to load
        functions when @functions is not supplied.
    :param functions: Pre-loaded functions; loaded from @dataset when falsy.
    :param base_folder: Output folder; defaults to the dataset's clusters folder.
    :param file_name: Base name of output files; defaults to @language or "clusters".
    :param skip_singles: Forwarded to the clusterer.
    :param update_clone_meta: Forwarded to the Java function loader.
    :param clustering_error: Forwarded to the clusterer.
    :param cluster_suffix: Suffix used when saving clusters to the DB.
    """
    if not functions:
        if language == "java":
            functions = load_functions(dataset, update_clone_meta=update_clone_meta)
        elif language == "python":
            functions = load_py_functions(dataset)
        elif language == "java_python":
            functions = (load_functions(dataset, update_clone_meta=update_clone_meta)
                         + load_py_functions(dataset))
        else:
            raise RuntimeError("Invalid language: %s" % language)
    # if dataset not in ["codejam", "introclass"]:
    #     raise RuntimeError("Invalid dataset: %s" % dataset)
    LOGGER.info("Clustering ... ")
    if file_name is None:
        file_name = language or "clusters"
        LOGGER.warning("A @file_name is not provided. Reverting file name to '%s'" % file_name)
    if base_folder is None:
        base_folder = lib.get_clusters_folder(dataset)
    clusters_txt_file = os.path.join(base_folder, "%s.txt" % file_name)
    clusters_pkl_file = os.path.join(base_folder, "%s.pkl" % file_name)
    clusters_report_file = os.path.join(base_folder, "%s.md" % file_name)
    clusters = get_clusterer()(functions).cluster(
        clusters_txt_file, skip_singles=skip_singles,
        clustering_error=clustering_error)
    cache.save_pickle(clusters_pkl_file, clusters)
    clusterer.save_clusters_to_db(dataset, clusters, cluster_suffix)
    cluster_sizes = [len(funcs) for label, funcs in clusters.items()
                     if label != -1]
    n_clustered = sum(cluster_sizes)
    # Assemble the markdown report; content is the concatenation of these parts.
    report_parts = [
        "## Cluster sizes\n",
        "* Number of clusters: %d\n" % len(clusters),
        "* Number of functions clustered: %d\n" % n_clustered,
        "* Number of functions not clustered: %d\n\n" % (len(functions) - n_clustered),
        "## REPORT\n",
        Stat(cluster_sizes).report(),
    ]
    cache.write_file(clusters_report_file, "".join(report_parts))
def save_only_mixed_clusters(dataset, mixed_file_base_name):
    """
    Save only mixed functions
    :param dataset: Name of dataset
    :param mixed_file_base_name: Type of language eg. java_python
    :return:
    """
    clusters_base_folder = os.path.join(lib.get_clusters_folder(dataset),
                                        "cluster_testing")
    for folder in sorted(cache.list_dir(clusters_base_folder, is_absolute=False)):
        LOGGER.info("Processing '%s' ..." % folder)
        folder_path = os.path.join(clusters_base_folder, folder)
        base_clusters_file = os.path.join(folder_path,
                                          "%s.pkl" % mixed_file_base_name)
        base_clusters = cache.load_pickle(base_clusters_file)
        # Keep only multi-function clusters drawn from more than one source language.
        mixed_clusters = {
            label: funcs
            for label, funcs in base_clusters.items()
            if label != -1 and len(funcs) > 1
            and len({f.source for f in funcs}) > 1
        }
        LOGGER.info("For folder = %s, # of mixed clusters = %d"
                    % (folder, len(mixed_clusters)))
        file_path = os.path.join(folder_path, "only_mixed.txt")
        pkl_path = os.path.join(folder_path, "only_mixed.pkl")
        file_contents = []
        for label, funcs in mixed_clusters.items():
            file_contents.append("\n\n****** Cluster %d ******" % label)
            file_contents.extend(f.body for f in funcs)
        cache.write_file(file_path, "\n".join(file_contents))
        cache.save_pickle(pkl_path, mixed_clusters)
def fetch_statements(language, force=False, do_save=False, limit=None, as_list=False):
    """Load executed statements for @language from the pickle cache or MongoDB.

    :param language: Language whose statements to load.
    :param force: When True, bypass the pickle cache and rebuild from MongoDB.
    :param do_save: When True, persist the rebuilt statements to the pickle cache.
    :param limit: Optional cap forwarded to the MongoDB query.
    :param as_list: When True, return a list of Statement objects; otherwise a
        dict keyed by mongo_id.
    :return: list or dict of Statement objects (see @as_list).
    """
    pkl_file = get_executed_stmts_pkl(language)
    if not force and cache.file_exists(pkl_file):
        LOGGER.info("Retrieving existing '%s' statements!" % language)
        cached = cache.load_pickle(pkl_file)
        # Fix: materialize a real list — the original returned a dict_values
        # view, which is not indexable and tracks later dict mutations.
        return list(cached.values()) if as_list else cached
    LOGGER.info("Reprocessing '%s' statements!" % language)
    store = mongo_driver.MongoStore(props.DATASET)
    stmts = {}
    mongo_stmts = store.load_stmts(language=language, is_valid=True,
                                   has_output=True, limit=limit).items()
    n_stmts = len(mongo_stmts)
    for i, (_key, mongo_stmt) in enumerate(mongo_stmts):
        LOGGER.info("Processing %d / %d .... " % (i + 1, n_stmts))
        stmt = Statement(mongo_id=mongo_stmt["_id"],
                         snippet=mongo_stmt["snippet"],
                         variables=mongo_stmt["variables"],
                         language=language,
                         outputs=format_outputs(mongo_stmt["outputs"]))
        stmts[stmt.mongo_id] = stmt
    if do_save:
        LOGGER.info("Saving statements .... ")
        cache.save_pickle(pkl_file, stmts)
    return list(stmts.values()) if as_list else stmts
def cluster(clustering_error):
    """Cluster pandas and R functions together and write txt / pkl / md outputs.

    :param clustering_error: Error threshold; also names the output folder.
    """
    functions = load_functions(PD_FUNCTIONS_PATH, "pandas")
    functions.update(load_functions(R_FUNCTIONS_PATH, "R"))
    file_name = "clusters"
    folder = os.path.join(BASE_CLUSTER_FOLDER, "%0.02f" % clustering_error)
    cache.mkdir(folder)
    clusters_txt_file = os.path.join(folder, "%s.txt" % file_name)
    clusters_pkl_file = os.path.join(folder, "%s.pkl" % file_name)
    clusters_report_file = os.path.join(folder, "%s.md" % file_name)
    rep_clusterer = RepresentativeClusterer(functions.values(),
                                            distance_function=execution_distance)
    clusters = rep_clusterer.cluster(clusters_txt_file, skip_singles=True,
                                     clustering_error=clustering_error)
    cache.save_pickle(clusters_pkl_file, clusters)
    cluster_sizes = [len(funcs) for label, funcs in clusters.items()
                     if label != -1]
    n_clustered = sum(cluster_sizes)
    report = ("## Cluster sizes\n"
              + "* Number of clusters: %d\n" % len(clusters)
              + "* Number of functions clustered: %d\n" % n_clustered
              + "* Number of functions not clustered: %d\n\n"
              % (len(functions) - n_clustered)
              + "## REPORT\n"
              + stat.Stat(cluster_sizes).report())
    cache.write_file(clusters_report_file, report)
def remove_overlapping_clusters(dataset, language="java_python"):
    """Drop functions whose metadata overlaps another function in the same cluster.

    Within each multi-function cluster, a greedy scan keeps the first function
    and, for each later one, either discards it on overlap (keeping the more
    succinct of the pair) or retains it. Results are written to the
    "non_overlapping" clusters folder and the DB.

    :param dataset: Name of dataset.
    :param language: Language key naming the input/output cluster files.
    """
    # TODO: Think about how to remove syntactic equivalence
    store = mongo_store.FunctionStore(dataset)
    base_file = os.path.join(properties.META_RESULTS_FOLDER, dataset,
                             "clusters", "%s.pkl" % language)
    clusters = cache.load_pickle(base_file)
    non_overlapping_clusters = {}
    for label, functions in clusters.items():
        if label == -1 or len(functions) == 1:
            continue
        kept = []
        metas = {}
        for func in functions:
            meta = store.load_metadata({"name": func.base_name})
            metas[func.base_name] = meta
            if not kept:
                kept.append(func)
                continue
            found_overlap = False
            for pos, kept_func in enumerate(kept):
                kept_meta = metas[kept_func.base_name]
                if overlaps(meta, kept_meta):
                    found_overlap = True
                    # On overlap, retain whichever of the pair is more succinct.
                    if is_more_succinct(meta, kept_meta):
                        kept[pos] = func
                    break
            if not found_overlap:
                kept.append(func)
        if len(kept) > 1:
            non_overlapping_clusters[label] = kept
    write_folder = os.path.join(properties.META_RESULTS_FOLDER, dataset,
                                "clusters", "non_overlapping")
    cache.mkdir(write_folder)
    clusters_txt_file = os.path.join(write_folder, "%s.txt" % language)
    clusters_pkl_file = os.path.join(write_folder, "%s.pkl" % language)
    clusters_report_file = os.path.join(write_folder, "%s.md" % language)
    cache.save_pickle(clusters_pkl_file, non_overlapping_clusters)
    clusterer.save_clusters_to_db(dataset, non_overlapping_clusters,
                                  "non_overlapping")
    clusterer.save_clusters_to_txt(non_overlapping_clusters, clusters_txt_file)
    sizes = [len(funcs) for label, funcs in non_overlapping_clusters.items()
             if label != -1]
    meta_data = "## Cluster sizes\n"
    meta_data += "* Number of clusters: %d\n" % len(non_overlapping_clusters)
    meta_data += "* Number of functions clustered: %d\n" % sum(sizes)
    meta_data += "## REPORT\n"
    meta_data += Stat(sizes).report()
    cache.write_file(clusters_report_file, meta_data)
def save_only_target_functions(dataset, mixed_file_base_name, target_language):
    """
    Save only java functions from a mixture of java and python clusters
    :param dataset: Name of dataset
    :param mixed_file_base_name: Type of language eg. java_python
    :param target_language: Target Language
    :return:
    """
    clusters_base_folder = os.path.join(lib.get_clusters_folder(dataset),
                                        "cluster_testing")
    for folder in sorted(cache.list_dir(clusters_base_folder, is_absolute=False)):
        LOGGER.info("Processing '%s' ..." % folder)
        folder_path = os.path.join(clusters_base_folder, folder)
        cache.mkdir(folder_path)
        base_clusters_file = os.path.join(folder_path,
                                          "%s.pkl" % mixed_file_base_name)
        base_clusters = cache.load_pickle(base_clusters_file)
        # A cluster qualifies when every one of its (>=2) functions comes from
        # the target language — i.e. its set of sources is exactly {target}.
        target_clusters = {
            label: funcs
            for label, funcs in base_clusters.items()
            if label != -1 and len(funcs) > 1
            and {f.source for f in funcs} == {target_language}
        }
        LOGGER.info("For folder = %s, # of '%s' clusters = %d"
                    % (folder, target_language, len(target_clusters)))
        file_path = os.path.join(folder_path, "only_%s.txt" % target_language)
        pkl_path = os.path.join(folder_path, "only_%s.pkl" % target_language)
        file_contents = []
        for label, funcs in target_clusters.items():
            file_contents.append("\n\n****** Cluster %d ******" % label)
            file_contents.extend(f.body for f in funcs)
        cache.write_file(file_path, "\n".join(file_contents))
        cache.save_pickle(pkl_path, target_clusters)
def save_function(func_data):
    """Insert or update @func_data, keyed by its "name", in the pickled store."""
    # load_pickle may yield a falsy value on a missing/empty store; start fresh.
    registry = cache.load_pickle(FUNCTION_STORE) or {}
    registry[func_data["name"]] = func_data
    cache.save_pickle(FUNCTION_STORE, registry)
def store_args(key, args):
    """Persist @args under @key in the pickled argument store."""
    # load_pickle may yield a falsy value on a missing/empty store; start fresh.
    stored = cache.load_pickle(STORE_PATH) or {}
    stored[key] = args
    cache.save_pickle(STORE_PATH, stored)