Пример #1
0
def fetch_statements(language,
                     force=False,
                     do_save=False,
                     limit=None,
                     as_list=False):
    """Fetch executed statements for `language`, preferring the pickle cache.

    :param language: Language whose executed statements are loaded.
    :param force: When True, skip the cache and rebuild from the mongo store.
    :param do_save: When True, pickle the rebuilt statements for later reuse.
    :param limit: Optional cap passed through to the mongo query.
    :param as_list: When True, return a list of Statement objects instead of
        a dict keyed by mongo id.
    :return: dict mapping mongo_id -> Statement, or a list of Statements.
    """
    pkl_file = get_executed_stmts_pkl(language)
    if not force and cache.file_exists(pkl_file):
        LOGGER.info("Retrieving existing '%s' statements!" % language)
        stmts = cache.load_pickle(pkl_file)
        if as_list:
            # Fix: dict.values() is a lazy view in Python 3; materialize it
            # so callers asking for a list really get one.
            return list(stmts.values())
        return stmts
    LOGGER.info("Reprocessing '%s' statements!" % language)
    store = mongo_driver.MongoStore(props.DATASET)
    stmts = {}
    mongo_stmts = store.load_stmts(language=language,
                                   is_valid=True,
                                   has_output=True,
                                   limit=limit).items()
    n_stmts = len(mongo_stmts)
    for i, (key, mongo_stmt) in enumerate(mongo_stmts):
        LOGGER.info("Processing %d / %d .... " % (i + 1, n_stmts))
        stmt = Statement(mongo_id=mongo_stmt["_id"],
                         snippet=mongo_stmt["snippet"],
                         variables=mongo_stmt["variables"],
                         language=language,
                         outputs=format_outputs(mongo_stmt["outputs"]))
        stmts[stmt.mongo_id] = stmt
    if do_save:
        LOGGER.info("Saving statements .... ")
        cache.save_pickle(pkl_file, stmts)
    if as_list:
        # Same fix as the cached branch: return a real list, not a view.
        return list(stmts.values())
    return stmts
Пример #2
0
def save_clustered_function_names(dataset, language):
    """Persist executed function names (grouped by source language) and the
    clone-name mapping extracted from a dataset's saved clusters."""
    LOGGER.info(
        "Saving clustered function names for dataset '%s' and language '%s'" %
        (dataset, language))
    clusters = cache.load_pickle(get_cluster_path(dataset, language))
    names_by_lang = defaultdict(set)
    clones_by_name = defaultdict(dict)
    for functions in clusters.values():
        for funct in functions:
            # A function with a base_name is a clone of that base function.
            base = getattr(funct, "base_name", None)
            if base is not None:
                name, clone_name, clone_attr = base, funct.name, funct.return_attribute
            else:
                name, clone_name, clone_attr = funct.name, None, None
            names_by_lang[funct.source].add(name)
            if clone_name:
                clones_by_name[name][clone_attr] = clone_name
    execution_store = get_execution_store(dataset)
    for lang, names in names_by_lang.items():
        execution_store.save_language_executed_function_names(
            lang, list(names))
    for name, clones in clones_by_name.items():
        execution_store.save_cloned_function_names(name, clones)
Пример #3
0
def save_only_mixed_clusters(dataset, mixed_file_base_name):
    """
    Save only mixed functions
    :param dataset: Name of dataset
    :param mixed_file_base_name: Type of language eg. java_python
    :return:
    """
    clusters_base_folder = os.path.join(lib.get_clusters_folder(dataset), "cluster_testing")
    for folder in sorted(cache.list_dir(clusters_base_folder, is_absolute=False)):
        LOGGER.info("Processing '%s' ..." % folder)
        folder_path = os.path.join(clusters_base_folder, folder)
        base_clusters = cache.load_pickle(
            os.path.join(folder_path, "%s.pkl" % mixed_file_base_name))
        # A cluster is "mixed" when it is real (label != -1), has more than
        # one member, and spans more than one source language.
        mixed_clusters = {
            label: functions
            for label, functions in base_clusters.items()
            if label != -1 and len(functions) > 1
            and len({func.source for func in functions}) > 1
        }
        LOGGER.info("For folder = %s, # of mixed clusters = %d" % (folder, len(mixed_clusters)))
        file_contents = []
        for label, functions in mixed_clusters.items():
            file_contents.append("\n\n****** Cluster %d ******" % label)
            file_contents.extend(func.body for func in functions)
        cache.write_file(os.path.join(folder_path, "only_mixed.txt"),
                         "\n".join(file_contents))
        cache.save_pickle(os.path.join(folder_path, "only_mixed.pkl"),
                          mixed_clusters)
Пример #4
0
def get_class_and_generate_functions(dataset, language="java_python", eps=0.01):
    """Print, in sorted order, every java package that appears in a real
    (non-noise, non-singleton) cluster for the given dataset/eps."""
    base_file = os.path.join(properties.META_RESULTS_FOLDER, dataset, "clusters", "cluster_testing",
                             "eps_%0.2f" % eps, "%s.pkl" % language)
    clusters = cache.load_pickle(base_file)
    packages = {
        func.package
        for label, functions in clusters.items()
        if label != -1 and len(functions) > 1
        for func in functions
        if func.source == "java"
    }
    for package in sorted(packages):
        print(package)
Пример #5
0
def remove_overlapping_clusters(dataset, language="java_python"):
    """Collapse functionally-overlapping members within each cluster, then
    persist the surviving clusters (pickle, txt, db) plus a markdown report.

    For each non-noise, non-singleton cluster, a function whose metadata
    `overlaps()` an already-kept function replaces it only if it is
    `is_more_succinct()`; otherwise it is dropped. Clusters reduced to a
    single member are discarded.

    :param dataset: Name of the dataset to process.
    :param language: Language-pair file base name, e.g. "java_python".
    """
    # TODO: Think about how to remove syntactic equivalence
    store = mongo_store.FunctionStore(dataset)
    base_file = os.path.join(properties.META_RESULTS_FOLDER, dataset,
                             "clusters", "%s.pkl" % language)
    clusters = cache.load_pickle(base_file)
    non_overlapping_clusters = {}
    for label, functions in clusters.items():
        # Skip noise (-1) and singleton clusters.
        if label == -1 or len(functions) == 1: continue
        non_overlapping_funcs = []
        # base_name -> metadata cache for functions seen in this cluster.
        metas = {}
        for func in functions:
            meta = store.load_metadata({"name": func.base_name})
            metas[func.base_name] = meta
            if len(non_overlapping_funcs) == 0:
                non_overlapping_funcs.append(func)
                continue
            is_non_overlapping_funcs_updated = False
            for i, existing_func in enumerate(non_overlapping_funcs):
                existing_meta = metas[existing_func.base_name]
                if overlaps(meta, existing_meta):
                    is_non_overlapping_funcs_updated = True
                    # Keep only one of the two overlapping functions,
                    # preferring the more succinct implementation.
                    if is_more_succinct(meta, existing_meta):
                        non_overlapping_funcs[i] = func
                    break
            if not is_non_overlapping_funcs_updated:
                # func overlapped nothing kept so far; keep it too.
                non_overlapping_funcs.append(func)
        if len(non_overlapping_funcs) > 1:
            non_overlapping_clusters[label] = non_overlapping_funcs
    write_folder = os.path.join(properties.META_RESULTS_FOLDER, dataset,
                                "clusters", "non_overlapping")
    cache.mkdir(write_folder)
    clusters_txt_file = os.path.join(write_folder, "%s.txt" % language)
    clusters_pkl_file = os.path.join(write_folder, "%s.pkl" % language)
    clusters_report_file = os.path.join(write_folder, "%s.md" % language)
    cache.save_pickle(clusters_pkl_file, non_overlapping_clusters)
    clusterer.save_clusters_to_db(dataset, non_overlapping_clusters,
                                  "non_overlapping")
    clusterer.save_clusters_to_txt(non_overlapping_clusters, clusters_txt_file)
    sizes = [
        len(cluster_funcs)
        for label, cluster_funcs in non_overlapping_clusters.items()
        if label != -1
    ]
    # Markdown summary of the surviving clusters' size distribution.
    meta_data = "## Cluster sizes\n"
    meta_data += "* Number of clusters: %d\n" % len(non_overlapping_clusters)
    meta_data += "* Number of functions clustered: %d\n" % sum(sizes)
    meta_data += "## REPORT\n"
    meta_data += Stat(sizes).report()
    cache.write_file(clusters_report_file, meta_data)
Пример #6
0
def load_functions(functions_path, source):
    """Load pickled function records, wrap each as a Function, and return a
    dict of only the useful ones keyed by function name."""
    raw_functions = cache.load_pickle(functions_path)
    all_functions = [
        Function(name=func_name,
                 input_key=attrs["inputKey"],
                 outputs=Outputs(attrs["outputs"]),
                 body=get_body(attrs),
                 source=source)
        for func_name, attrs in raw_functions.items()
    ]
    valid_functions = {}
    for funct in all_functions:
        if is_useful_function(funct):
            valid_functions[funct.name] = funct
    LOGGER.info("Valid Functions : %d / %d" %
                (len(valid_functions), len(all_functions)))
    return valid_functions
Пример #7
0
def save_only_target_functions(dataset, mixed_file_base_name, target_language):
    """
    Save only java functions from a mixture of java and python clusters
    :param dataset: Name of dataset
    :param mixed_file_base_name: Type of language eg. java_python
    :param target_language: Target Language
    :return:
    """
    clusters_base_folder = os.path.join(lib.get_clusters_folder(dataset),
                                        "cluster_testing")
    for folder in sorted(
            cache.list_dir(clusters_base_folder, is_absolute=False)):
        LOGGER.info("Processing '%s' ..." % folder)
        folder_path = os.path.join(clusters_base_folder, folder)
        cache.mkdir(folder_path)
        base_clusters = cache.load_pickle(
            os.path.join(folder_path, "%s.pkl" % mixed_file_base_name))
        target_clusters = {}
        for label, functions in base_clusters.items():
            if label == -1 or len(functions) == 1:
                continue
            # Keep the cluster only when every member comes from the target
            # language (equivalent to "contains target and nothing else",
            # since clusters here have at least two members).
            if {func.source for func in functions} == {target_language}:
                target_clusters[label] = functions
        LOGGER.info("For folder = %s, # of '%s' clusters = %d" %
                    (folder, target_language, len(target_clusters)))
        file_contents = []
        for label, functions in target_clusters.items():
            file_contents.append("\n\n****** Cluster %d ******" % label)
            file_contents.extend(func.body for func in functions)
        cache.write_file(
            os.path.join(folder_path, "only_%s.txt" % target_language),
            "\n".join(file_contents))
        cache.save_pickle(
            os.path.join(folder_path, "only_%s.pkl" % target_language),
            target_clusters)
Пример #8
0
def cluster_source(dataset, language="java_python"):
    """Tally clusters by language composition (java-only, python-only, or
    mixed) and print the resulting counts."""
    LOGGER.info("Loading pickle ...")
    clusters = cache.load_pickle(get_cluster_path(dataset, language))
    cluster_type_counts = defaultdict(int)
    for label, functions in clusters.items():
        if label == -1:
            continue
        sources = {func.source for func in functions}
        has_java = "java" in sources
        has_python = "python" in sources
        if has_java and has_python:
            cluster_type_counts["mixed"] += 1
        elif has_java:
            cluster_type_counts["java"] += 1
        elif has_python:
            cluster_type_counts["python"] += 1
    print(cluster_type_counts)
Пример #9
0
def load_clusters_from_pkl(file_name):
    """Deserialize and return the clusters stored in the given pickle file."""
    clusters = cache.load_pickle(file_name)
    return clusters