def _execute(self, data_bundle, from_node):
    """
    Emit a (context features -> next line) mapping for one code piece.

    The code in ``data_bundle["code"]`` is split on newlines. Every line
    except the last is parsed with ``Parser.spt_parse`` and turned into a
    feature set by ``FeatureExtraction.extract``; the last line is treated
    as the "next line" that follows that context. The emitted bundle
    carries ``features``, ``line_number`` (number of context lines) and
    ``next_line``.

    Code consisting of a single line has no context and is skipped
    silently. Failures while parsing, extracting or emitting are logged
    and swallowed so one bad code piece does not stop the pipeline.

    :param data_bundle: bundle providing the ``"code"`` string
    :param from_node: unused here
    """
    code = data_bundle["code"]
    lines = code.split("\n")
    # a single line leaves nothing to build the context features from
    if len(lines) <= 1:
        return

    context_lines, next_line = lines[:-1], lines[-1]
    try:
        root = Parser.spt_parse("\n".join(context_lines))
        features = FeatureExtraction.extract(root)
        self._emit(
            DataBundle(
                data_dict={
                    "features": features,
                    "line_number": len(context_lines),
                    "next_line": next_line
                }))
        logger.info("extract mapping succeeded, next line is: " +
                    next_line)
    except Exception:
        # best effort: skip this piece, but let KeyboardInterrupt and
        # SystemExit propagate instead of being swallowed
        logger.warn("extract mapping failed")
예제 #2
0
    def _execute(self, data_bundle):
        """
        collect all information in python codes

        Every entry of ``data_bundle["work_dir"]`` is handled by kind:

        * a directory is walked recursively and each ``.py`` file in it is
          passed to ``self._handle_python_code``;
        * a ``.py`` file is passed to ``self._handle_python_code``;
        * a tar or zip archive is unpacked into ``./tmp`` and the first
          entry of that directory is passed to
          ``self._handle_python_codes_in_app_dir``;
        * anything else (or an archive that unpacks to nothing) is
          reported with a warning.

        ``./tmp`` is removed before and after every entry, even when a
        handler raises.

        :param data_bundle: bundle providing the ``"work_dir"`` path
        """
        work_dir = data_bundle["work_dir"]
        # scratch directory the archives are unpacked into
        tmp_path = os.path.join(os.curdir, "tmp")
        for package_name in os.listdir(work_dir):
            package_path = os.path.join(work_dir, package_name)
            # check tmp directory first
            if os.path.exists(tmp_path):
                shutil.rmtree(tmp_path)
            try:
                if os.path.isdir(package_path):
                    # directory
                    for root, _dirs, files in os.walk(package_path):
                        for f in files:
                            filepath = os.path.join(root, f)
                            if filepath.endswith(".py"):
                                self._handle_python_code(filepath)
                elif os.path.isfile(package_path) and \
                        package_path.endswith(".py"):
                    # plain python file
                    self._handle_python_code(package_path)
                else:
                    # extract package; archives are regular files, so they
                    # must be probed here rather than after an isfile() test
                    opener = None
                    if os.path.isfile(package_path):
                        if tarfile.is_tarfile(package_path):
                            opener = tarfile.open
                        elif zipfile.is_zipfile(package_path):
                            opener = zipfile.ZipFile

                    extracted = []
                    if opener is not None:
                        # NOTE(review): extractall() trusts the member paths
                        # stored in the archive; a hostile archive can write
                        # outside tmp_path. Validate members if the work
                        # directory may hold untrusted packages.
                        with opener(package_path) as archive:
                            archive.extractall(tmp_path)
                        if os.path.isdir(tmp_path):
                            extracted = os.listdir(tmp_path)

                    if extracted:
                        extract_app_dir = os.path.join(tmp_path, extracted[0])
                        self._handle_python_codes_in_app_dir(extract_app_dir)
                    else:
                        logger.warn(
                            "{} could not be extracted".format(package_path))
            finally:
                # double check tmp path
                if os.path.exists(tmp_path):
                    shutil.rmtree(tmp_path)
예제 #3
0
    def _execute(self, data_bundle, from_node):
        """
        Print the indexed "next line" that best matches the given code.

        The code in ``data_bundle["code"]`` is parsed with
        ``Parser.spt_parse`` and reduced to a feature set by
        ``FeatureExtraction.extract``. Each ``(features, line_number,
        next_line)`` entry yielded by ``self._index_reader`` is ranked by
        the size of its feature intersection with the query; ties are
        broken by the smallest difference between the entry's line number
        and the query's line count. The winning ``next_line`` (an empty
        string when nothing beats the initial state) is printed and logged.

        Failures while parsing, ranking or printing are logged and
        swallowed.

        :param data_bundle: bundle providing the ``"code"`` string
        :param from_node: unused here
        """
        code = data_bundle["code"]
        try:
            features = FeatureExtraction.extract(Parser.spt_parse(code))
            code_line_number = len(code.split("\n"))
            max_intersection, min_line_number_diff, ans = 0, 1000000000, ""

            for index_features, line_number, next_line in self._index_reader:
                intersection = len(features & index_features)
                # a weaker match can never replace the current best
                if intersection < max_intersection:
                    continue
                line_number_diff = abs(code_line_number - line_number)
                # better overlap wins outright; equal overlap needs a
                # strictly closer line number
                if intersection > max_intersection or \
                        line_number_diff < min_line_number_diff:
                    max_intersection = intersection
                    min_line_number_diff = line_number_diff
                    ans = next_line
            print(ans)
            logger.info(
                "index ranking succeeded, next line is: {}".format(ans))
        except Exception:
            # best effort: report the failure, but let KeyboardInterrupt
            # and SystemExit propagate instead of being swallowed
            logger.warn("index ranking failed")
 def _execute(self, data_bundle, from_node):
     """
     Parse one code piece and forward it together with its feature set.

     Reads ``"code"`` and ``"filepath"`` from the incoming bundle, parses
     the code with ``Parser.spt_parse``, extracts features with
     ``FeatureExtraction.extract`` and emits a bundle holding ``code``,
     ``features`` and ``filepath``. A code piece that fails anywhere in
     that chain is logged and dropped.

     :param data_bundle: bundle providing ``"code"`` and ``"filepath"``
     :param from_node: unused here
     """
     source_code, filepath = data_bundle["code"], data_bundle["filepath"]
     try:
         features = FeatureExtraction.extract(Parser.spt_parse(source_code))
         payload = dict(code=source_code,
                        features=features,
                        filepath=filepath)
         self._emit(DataBundle(data_dict=payload))
         success_message = "parsing code succeeded, filepath: {}".format(
             filepath)
         logger.info(success_message)
     except Exception:
         # unparsable pieces are expected; skip them and carry on
         failure_message = "parsing code pieces failed, filepath: {}".format(
             filepath)
         logger.warn(failure_message)
예제 #5
0
 def _handle_python_code(self, filepath):
     """
     split the python codes into several chunks with different threshold

     Two strategies produce code pieces from the file at ``filepath``;
     each distinct piece is emitted once as a bundle with ``filepath``
     and ``code``:

     1. sliding windows of at most 1, 2, 5, 10 and 20 lines over the
        lines that ``self._filter`` does not reject;
     2. every class and function definition found by ``ast``, taken as
        the definition line plus all following non-empty lines that are
        indented deeper than it.

     Each piece is normalised with ``string_util.left_padding_strings``
     before it is compared and emitted. Failures in the second strategy
     (for example a file that does not parse) are logged and swallowed.

     :param filepath: path of the python file to split
     """
     # read the file once and close it; both algorithms work on this text
     with open(filepath, "r") as f:
         text = f.read()
     lines = text.split("\n")

     code_piece_set = set()

     def emit_once(code_block):
         # normalise the padding, then emit the piece unless already seen
         code_piece = "\n".join(string_util.left_padding_strings(code_block))
         if code_piece in code_piece_set:
             return
         code_piece_set.add(code_piece)
         self._emit(DataBundle(data_dict={
             "filepath": filepath,
             "code": code_piece
         }))

     # Algorithm 1: sliding windows
     # split() leaves an empty tail after a final newline; iterating the
     # file line by line would not yield it, so drop it here
     file_lines = lines[:-1] if lines[-1] == "" else lines
     kept_lines = []
     for line in file_lines:
         # preprocessing line string, replace \t with four spaces
         line = line.replace("\t", "    ")
         # empty line and comments will be ignored
         # (assumes self._filter depends only on the line - TODO confirm)
         if not self._filter(line):
             kept_lines.append(line)
     for chunk_size in (1, 2, 5, 10, 20):
         for end in range(1, len(kept_lines) + 1):
             # window of the last chunk_size kept lines ending at this one
             emit_once(kept_lines[max(0, end - chunk_size):end])

     # Algorithm 2: check classes and functions
     try:
         ast_root = ast.parse(text)
         class_linenos, function_linenos = [], []
         for node in ast.walk(ast_root):
             # ast line numbers are 1-based, list indices 0-based
             if isinstance(node, ast.ClassDef):
                 class_linenos.append(node.lineno - 1)
             elif isinstance(node, ast.FunctionDef):
                 function_linenos.append(node.lineno - 1)
         # collect classes and functions
         for start in (class_linenos + function_linenos):
             left_padding = string_util.get_left_padding_spaces(lines[start])
             code_block = [lines[start]]
             for index in range(start + 1, len(lines)):
                 candidate = lines[index]
                 if string_util.is_empty_string(candidate):
                     continue
                 # the body ends at the first non-empty line that is not
                 # indented deeper than the definition line
                 if not (string_util.get_left_padding_spaces(candidate) >
                         left_padding):
                     break
                 code_block.append(candidate)
             emit_once(code_block)
     except Exception:
         # best effort: report the failure, but let KeyboardInterrupt and
         # SystemExit propagate instead of being swallowed
         logger.warn("handle python file: {} failed".format(filepath))