def fetch_statements(language, force=False, do_save=False, limit=None, as_list=False):
    pkl_file = get_executed_stmts_pkl(language)
    # Reuse previously pickled statements unless a rebuild is forced.
    if not force and cache.file_exists(pkl_file):
        LOGGER.info("Retrieving existing '%s' statements!" % language)
        if as_list:
            return cache.load_pickle(pkl_file).values()
        return cache.load_pickle(pkl_file)
    LOGGER.info("Reprocessing '%s' statements!" % language)
    store = mongo_driver.MongoStore(props.DATASET)
    stmts = {}
    mongo_stmts = store.load_stmts(language=language, is_valid=True,
                                   has_output=True, limit=limit).items()
    n_stmts = len(mongo_stmts)
    for i, (key, mongo_stmt) in enumerate(mongo_stmts):
        LOGGER.info("Processing %d / %d .... " % (i + 1, n_stmts))
        stmt = Statement(mongo_id=mongo_stmt["_id"],
                         snippet=mongo_stmt["snippet"],
                         variables=mongo_stmt["variables"],
                         language=language,
                         outputs=format_outputs(mongo_stmt["outputs"]))
        stmts[stmt.mongo_id] = stmt
    if do_save:
        LOGGER.info("Saving statements .... ")
        cache.save_pickle(pkl_file, stmts)
    if as_list:
        return stmts.values()
    return stmts
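# Example usage (a minimal sketch; "python" and the limit of 100 are illustrative
# values, and it assumes `cache`, `mongo_driver`, and `props.DATASET` are
# configured as elsewhere in this module):
#
#   stmts = fetch_statements("python", force=True, do_save=True, limit=100)
#   LOGGER.info("Fetched %d 'python' statements" % len(stmts))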
def save_clustered_function_names(dataset, language):
    LOGGER.info("Saving clustered function names for dataset '%s' and language '%s'"
                % (dataset, language))
    cluster_path = get_cluster_path(dataset, language)
    clusters = cache.load_pickle(cluster_path)
    function_names = defaultdict(set)
    cloned_functions = defaultdict(dict)
    for label, functions in clusters.items():
        for funct in functions:
            name, clone_name, clone_attribute = None, None, None
            lang = funct.source
            # Cloned functions carry the original name in `base_name`.
            if hasattr(funct, "base_name") and funct.base_name is not None:
                name = funct.base_name
                clone_name = funct.name
                clone_attribute = funct.return_attribute
            else:
                name = funct.name
            function_names[lang].add(name)
            if clone_name:
                cloned_functions[name][clone_attribute] = clone_name
    execution_store = get_execution_store(dataset)
    for lang, names in function_names.items():
        execution_store.save_language_executed_function_names(lang, list(names))
    for name, clones in cloned_functions.items():
        execution_store.save_cloned_function_names(name, clones)
def save_only_mixed_clusters(dataset, mixed_file_base_name):
    """
    Save only the clusters that mix functions from more than one source language.
    :param dataset: Name of dataset
    :param mixed_file_base_name: Base name of the clusters file, e.g. "java_python"
    :return: None
    """
    clusters_base_folder = os.path.join(lib.get_clusters_folder(dataset), "cluster_testing")
    for folder in sorted(cache.list_dir(clusters_base_folder, is_absolute=False)):
        LOGGER.info("Processing '%s' ..." % folder)
        folder_path = os.path.join(clusters_base_folder, folder)
        base_clusters_file = os.path.join(folder_path, "%s.pkl" % mixed_file_base_name)
        base_clusters = cache.load_pickle(base_clusters_file)
        mixed_clusters = {}
        for label, functions in base_clusters.items():
            # Skip noise (-1) and singleton clusters.
            if label == -1 or len(functions) == 1:
                continue
            sources = set()
            for func in functions:
                sources.add(func.source)
            if len(sources) > 1:
                mixed_clusters[label] = functions
        LOGGER.info("For folder = %s, # of mixed clusters = %d" % (folder, len(mixed_clusters)))
        file_path = os.path.join(folder_path, "only_mixed.txt")
        pkl_path = os.path.join(folder_path, "only_mixed.pkl")
        file_contents = []
        for label, functions in mixed_clusters.items():
            file_contents.append("\n\n****** Cluster %d ******" % label)
            for func in functions:
                file_contents.append(func.body)
        cache.write_file(file_path, "\n".join(file_contents))
        cache.save_pickle(pkl_path, mixed_clusters)
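# Example usage (a sketch; "sample_dataset" is a placeholder dataset name and it
# assumes the "<base_name>.pkl" cluster files already exist under cluster_testing/):
#
#   save_only_mixed_clusters("sample_dataset", "java_python")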
def get_class_and_generate_functions(dataset, language="java_python", eps=0.01):
    base_file = os.path.join(properties.META_RESULTS_FOLDER, dataset, "clusters",
                             "cluster_testing", "eps_%0.2f" % eps, "%s.pkl" % language)
    clusters = cache.load_pickle(base_file)
    packages = set()
    for label, functions in clusters.items():
        if label == -1 or len(functions) == 1:
            continue
        for func in functions:
            if func.source == "java":
                packages.add(func.package)
    for package in sorted(list(packages)):
        print(package)
def remove_overlapping_clusters(dataset, language="java_python"):
    # TODO: Think about how to remove syntactic equivalence
    store = mongo_store.FunctionStore(dataset)
    base_file = os.path.join(properties.META_RESULTS_FOLDER, dataset, "clusters",
                             "%s.pkl" % language)
    clusters = cache.load_pickle(base_file)
    non_overlapping_clusters = {}
    for label, functions in clusters.items():
        if label == -1 or len(functions) == 1:
            continue
        non_overlapping_funcs = []
        metas = {}
        for func in functions:
            meta = store.load_metadata({"name": func.base_name})
            metas[func.base_name] = meta
            if len(non_overlapping_funcs) == 0:
                non_overlapping_funcs.append(func)
                continue
            is_non_overlapping_funcs_updated = False
            for i, existing_func in enumerate(non_overlapping_funcs):
                existing_meta = metas[existing_func.base_name]
                if overlaps(meta, existing_meta):
                    is_non_overlapping_funcs_updated = True
                    # Keep the more succinct of the two overlapping functions.
                    if is_more_succinct(meta, existing_meta):
                        non_overlapping_funcs[i] = func
                    break
            if not is_non_overlapping_funcs_updated:
                non_overlapping_funcs.append(func)
        if len(non_overlapping_funcs) > 1:
            non_overlapping_clusters[label] = non_overlapping_funcs
    write_folder = os.path.join(properties.META_RESULTS_FOLDER, dataset, "clusters",
                                "non_overlapping")
    cache.mkdir(write_folder)
    clusters_txt_file = os.path.join(write_folder, "%s.txt" % language)
    clusters_pkl_file = os.path.join(write_folder, "%s.pkl" % language)
    clusters_report_file = os.path.join(write_folder, "%s.md" % language)
    cache.save_pickle(clusters_pkl_file, non_overlapping_clusters)
    clusterer.save_clusters_to_db(dataset, non_overlapping_clusters, "non_overlapping")
    clusterer.save_clusters_to_txt(non_overlapping_clusters, clusters_txt_file)
    sizes = [len(cluster_funcs)
             for label, cluster_funcs in non_overlapping_clusters.items()
             if label != -1]
    meta_data = "## Cluster sizes\n"
    meta_data += "* Number of clusters: %d\n" % len(non_overlapping_clusters)
    meta_data += "* Number of functions clustered: %d\n" % sum(sizes)
    meta_data += "## REPORT\n"
    meta_data += Stat(sizes).report()
    cache.write_file(clusters_report_file, meta_data)
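# Example usage (a sketch; "sample_dataset" is a placeholder and it assumes the
# base "java_python.pkl" clusters file has already been generated for that dataset):
#
#   remove_overlapping_clusters("sample_dataset", language="java_python")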
def load_functions(functions_path, source):
    functions_dict = cache.load_pickle(functions_path)
    functions = []
    for func_name, func_dict in functions_dict.items():
        outputs = Outputs(func_dict["outputs"])
        funct = Function(name=func_name,
                         input_key=func_dict["inputKey"],
                         outputs=outputs,
                         body=get_body(func_dict),
                         source=source)
        functions.append(funct)
    valid_functions = {
        funct.name: funct
        for funct in functions
        if is_useful_function(funct)
    }
    LOGGER.info("Valid Functions : %d / %d" % (len(valid_functions), len(functions)))
    return valid_functions
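# Example usage (a sketch; the pickle path is a placeholder and is assumed to map
# function names to dicts with "outputs", "inputKey", and body information, as
# produced elsewhere in this project):
#
#   java_functions = load_functions("path/to/java_funcs.pkl", "java")
#   LOGGER.info("Loaded %d valid java functions" % len(java_functions))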
def save_only_target_functions(dataset, mixed_file_base_name, target_language):
    """
    Save only the clusters made up entirely of functions in the target language,
    e.g. only java clusters from a mixed java/python clustering.
    :param dataset: Name of dataset
    :param mixed_file_base_name: Base name of the clusters file, e.g. "java_python"
    :param target_language: Target language
    :return: None
    """
    clusters_base_folder = os.path.join(lib.get_clusters_folder(dataset), "cluster_testing")
    for folder in sorted(cache.list_dir(clusters_base_folder, is_absolute=False)):
        LOGGER.info("Processing '%s' ..." % folder)
        folder_path = os.path.join(clusters_base_folder, folder)
        cache.mkdir(folder_path)
        base_clusters_file = os.path.join(folder_path, "%s.pkl" % mixed_file_base_name)
        base_clusters = cache.load_pickle(base_clusters_file)
        target_clusters = {}
        for label, functions in base_clusters.items():
            # Skip noise (-1) and singleton clusters.
            if label == -1 or len(functions) == 1:
                continue
            contains_target = False
            contains_other = False
            for func in functions:
                if func.source == target_language:
                    contains_target = True
                else:
                    contains_other = True
            if contains_target and not contains_other:
                target_clusters[label] = functions
        LOGGER.info("For folder = %s, # of '%s' clusters = %d"
                    % (folder, target_language, len(target_clusters)))
        file_path = os.path.join(folder_path, "only_%s.txt" % target_language)
        pkl_path = os.path.join(folder_path, "only_%s.pkl" % target_language)
        file_contents = []
        for label, functions in target_clusters.items():
            file_contents.append("\n\n****** Cluster %d ******" % label)
            for func in functions:
                file_contents.append(func.body)
        cache.write_file(file_path, "\n".join(file_contents))
        cache.save_pickle(pkl_path, target_clusters)
def cluster_source(dataset, language="java_python"):
    LOGGER.info("Loading pickle ...")
    cluster_path = get_cluster_path(dataset, language)
    clusters = cache.load_pickle(cluster_path)
    cluster_type_counts = defaultdict(int)
    for label, functions in clusters.items():
        if label == -1:
            continue
        contains_java = False
        contains_python = False
        for func in functions:
            if func.source == "java":
                contains_java = True
            elif func.source == "python":
                contains_python = True
        if contains_python and contains_java:
            cluster_type_counts["mixed"] += 1
        elif contains_java:
            cluster_type_counts["java"] += 1
        elif contains_python:
            cluster_type_counts["python"] += 1
    print(cluster_type_counts)
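# Example usage (a sketch; "sample_dataset" is a placeholder dataset name and it
# assumes the combined java/python clusters pickle already exists):
#
#   cluster_source("sample_dataset", language="java_python")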
def load_clusters_from_pkl(file_name):
    return cache.load_pickle(file_name)