def cluster_testing(dataset, language="java_python"):
    """Run the similarity computation once for each candidate clustering error.

    The functions of ``dataset`` are loaded a single time (the result of
    ``similarity.load_functions`` concatenated with the result of
    ``similarity.load_py_functions``) and reused for every run. Each run is
    handed its own ``base_folder`` of the form
    ``<clusters folder>/cluster_testing/eps_<error>``, with the error
    formatted to two decimals.

    Args:
        dataset: Dataset identifier forwarded to the loaders, to
            ``lib.get_clusters_folder`` and to ``similarity.compute_similarity``.
        language: Language identifier forwarded to
            ``similarity.compute_similarity``.
    """
    LOGGER.info("Testing different cluster sizes for dataset '%s' and language '%s'" % (dataset, language))
    loaded = similarity.load_functions(dataset)
    loaded_py = similarity.load_py_functions(dataset)
    functions = loaded + loaded_py
    testing_root = os.path.join(lib.get_clusters_folder(dataset), "cluster_testing")
    # Candidate clustering errors, tried in increasing order.
    for eps in (0.01, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30):
        similarity.compute_similarity(
            dataset,
            language,
            functions=functions,
            base_folder=os.path.join(testing_root, "eps_%0.2f" % eps),
            clustering_error=eps,
        )
def get_cross_val(dataset, n_folds, as_dict=False):
    """Split the outputs of the test functions into ``n_folds`` consecutive folds.

    Each fold contains a deep clone of every loaded function, with the clone's
    ``outputs`` replaced by the ``[start, end)`` subset of that function's own
    outputs. The fold size is the number of ``returns`` in the first
    function's outputs floor-divided by ``n_folds``, so any remainder past
    ``n_folds * fold_size`` is not part of any fold.

    Args:
        dataset: Dataset identifier forwarded to ``similarity.load_functions``
            and ``similarity.load_py_functions`` (both called with
            ``is_test=True``).
        n_folds: Number of folds to produce.
        as_dict: When true, each fold is a dict mapping ``clone.name`` to the
            clone; otherwise each fold is a list of clones.

    Returns:
        A list with one fold per index in ``range(n_folds)``.

    Raises:
        IndexError: If no functions were loaded.
        ZeroDivisionError: If ``n_folds`` is 0.
    """
    functions = similarity.load_functions(dataset, is_test=True) + similarity.load_py_functions(dataset, is_test=True)
    all_outputs = [func.outputs for func in functions]
    # The first function's outputs determine the fold size used for everyone.
    fold_size = len(all_outputs[0].returns) // n_folds
    folds = []
    for i in range(n_folds):
        start, end = i * fold_size, (i + 1) * fold_size
        clones = []
        # Pair each function with its own outputs. Indexing ``all_outputs`` by
        # the fold number would hand every function in fold ``i`` the outputs
        # of ``functions[i]`` and fail once ``n_folds`` exceeds the number of
        # functions.
        for func, outputs in zip(functions, all_outputs):
            clone = func.deep_clone()
            clone.outputs = outputs.subset(start, end)
            clones.append(clone)
        folds.append({clone.name: clone for clone in clones} if as_dict else clones)
    return folds