def _execute(self, data_bundle, from_node):
    """ extract features of the context lines and emit them with the next line """
    code = data_bundle["code"]
    lines = code.split("\n")
    if len(lines) > 1:
        try:
            # parse everything except the last line and extract its feature set
            root = Parser.spt_parse("\n".join(lines[:-1]))
            features = FeatureExtraction.extract(root)
            self._emit(
                DataBundle(
                    data_dict={
                        "features": features,
                        "line_number": len(lines) - 1,
                        "next_line": lines[-1]
                    }))
            logger.info("extract mapping succeeded, next line is: " + lines[-1])
        except Exception:
            logger.warn("extract mapping failed")
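# The methods in this section construct DataBundle objects with a data_dict
# keyword and read them with []-indexing. The real class belongs to the
# surrounding project and is not shown here; the following is only a minimal
# sketch of the interface those call sites assume, not the project's actual
# implementation.
class DataBundle:
    """ minimal dict-like carrier sketched from how _execute uses it """

    def __init__(self, data_dict=None):
        self._data = dict(data_dict or {})

    def __getitem__(self, key):
        return self._data[key]

    def __contains__(self, key):
        return key in self._data


# Hypothetical usage mirroring the emit call above:
# bundle = DataBundle(data_dict={"code": "a = 1\nb = a + 1\nprint(b)"})
# bundle["code"]  -> the raw source string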
def _execute(self, data_bundle):
    """ collect all information in python codes """
    work_dir = data_bundle["work_dir"]
    for package_name in os.listdir(work_dir):
        package_path = os.path.join(work_dir, package_name)
        # check tmp directory first
        tmp_path = os.path.join(os.curdir, "tmp")
        if os.path.exists(tmp_path):
            shutil.rmtree(tmp_path)
        # plain python file
        if os.path.isfile(package_path) and package_path.endswith(".py"):
            self._handle_python_code(package_path)
        # directory: walk it and handle every python file inside
        elif os.path.isdir(package_path):
            for root, dirs, files in os.walk(package_path):
                for f in files:
                    filepath = os.path.join(root, f)
                    if filepath.endswith(".py"):
                        self._handle_python_code(filepath)
        # anything else: try to extract it as a package archive
        else:
            if tarfile.is_tarfile(package_path):
                with tarfile.open(package_path) as tar_file:
                    tar_file.extractall(tmp_path)
                extract_app_dir = os.path.join(tmp_path, os.listdir(tmp_path)[0])
                self._handle_python_codes_in_app_dir(extract_app_dir)
            elif zipfile.is_zipfile(package_path):
                with zipfile.ZipFile(package_path) as zip_file:
                    zip_file.extractall(tmp_path)
                extract_app_dir = os.path.join(tmp_path, os.listdir(tmp_path)[0])
                self._handle_python_codes_in_app_dir(extract_app_dir)
            else:
                logger.warn(
                    "{} could not be extracted".format(package_path))
        # double check tmp path
        if os.path.exists(tmp_path):
            shutil.rmtree(tmp_path)
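# The collector above unpacks tar/zip archives into a shared ./tmp directory
# that it removes before and after each package. The standalone sketch below
# shows the same extraction step using a throwaway temporary directory
# instead; extract_archive and handle_app_dir are hypothetical names, and
# this is just one possible variant, not what the project itself does.
import os
import tarfile
import tempfile
import zipfile

def extract_archive(package_path, handle_app_dir):
    # Hypothetical helper: unpack a tar or zip archive into a private
    # temporary directory and hand the first extracted entry to the
    # supplied callback, mirroring the logic above.
    with tempfile.TemporaryDirectory() as tmp_dir:
        if tarfile.is_tarfile(package_path):
            with tarfile.open(package_path) as tar_file:
                tar_file.extractall(tmp_dir)
        elif zipfile.is_zipfile(package_path):
            with zipfile.ZipFile(package_path) as zip_file:
                zip_file.extractall(tmp_dir)
        else:
            return
        handle_app_dir(os.path.join(tmp_dir, os.listdir(tmp_dir)[0]))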
def _execute(self, data_bundle, from_node):
    """ rank index entries by feature overlap to predict the next line """
    code = data_bundle["code"]
    try:
        features = FeatureExtraction.extract(Parser.spt_parse(code))
        code_line_number = len(code.split("\n"))
        max_intersection, min_line_number_diff, ans = 0, 1000000000, ""
        for index_features, line_number, next_line in self._index_reader:
            intersection = len(features & index_features)
            # prefer the entry with the largest feature overlap; break ties
            # by the smallest difference in line count
            if intersection > max_intersection or \
                    (intersection == max_intersection and
                     abs(code_line_number - line_number) < min_line_number_diff):
                max_intersection = intersection
                min_line_number_diff = abs(code_line_number - line_number)
                ans = next_line
        print(ans)
        logger.info(
            "index ranking succeeded, next line is: {}".format(ans))
    except Exception:
        logger.warn("index ranking failed")
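# The ranking loop above iterates over self._index_reader and expects
# (feature set, line number, next line) triples whose feature sets support
# the & operator. The reader itself is not shown in this section; the
# stand-in below reads one JSON record per line purely to illustrate the
# assumed record shape, and both the function name and the field names are
# guesses rather than the project's real index format.
import json

def read_index(index_path):
    # Hypothetical index reader yielding what the ranking loop expects.
    with open(index_path, "r") as f:
        for record_line in f:
            record = json.loads(record_line)
            yield (frozenset(record["features"]),
                   record["line_number"],
                   record["next_line"])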
def _execute(self, data_bundle, from_node): """ parse source code, output source code and feature set """ source_code = data_bundle["code"] filepath = data_bundle["filepath"] try: root = Parser.spt_parse(source_code) features = FeatureExtraction.extract(root) emit_data_bundle = DataBundle(data_dict={ "code": source_code, "features": features, "filepath": filepath }) self._emit(emit_data_bundle) logger.info( "parsing code succeeded, filepath: {}".format(filepath)) except Exception as e: # if parsing failed, ignore it logger.warn( "parsing code pieces failed, filepath: {}".format(filepath))
def _handle_python_code(self, filepath):
    """ split the python codes into several chunks with different thresholds """
    # Algorithm 1: sliding windows
    chunk_sizes = [1, 2, 5, 10, 20]
    code_piece_set = set()
    for chunk_size in chunk_sizes:
        lines = []
        with open(filepath, "r") as f:
            for line in f:
                # preprocess the line: remove \n and replace \t with four spaces
                line = line.strip("\n").replace("\t", "    ")
                # empty lines and comments will be ignored
                if not self._filter(line):
                    lines.append(line)
                    # maintain a sliding window of at most chunk_size lines
                    if len(lines) > chunk_size:
                        lines.pop(0)
                    code_piece = "\n".join(
                        string_util.left_padding_strings(lines))
                    # dedup code piece
                    if code_piece not in code_piece_set:
                        code_piece_set.add(code_piece)
                        data_bundle = DataBundle(data_dict={
                            "filepath": filepath,
                            "code": code_piece
                        })
                        self._emit(data_bundle)

    # Algorithm 2: check classes and functions
    with open(filepath, "r") as f:
        text = f.read()
    lines = text.split("\n")
    try:
        ast_root = ast.parse(text)
        class_linenos, function_linenos = [], []
        for node in ast.walk(ast_root):
            if isinstance(node, ast.ClassDef):
                class_linenos.append(node.lineno - 1)
            elif isinstance(node, ast.FunctionDef):
                function_linenos.append(node.lineno - 1)
        # collect classes and functions as whole code blocks
        for lineno in (class_linenos + function_linenos):
            code_block = []
            left_padding = string_util.get_left_padding_spaces(
                lines[lineno])
            code_block.append(lines[lineno])
            lineno += 1
            # take every following line that is blank or indented deeper
            # than the class/function header
            while lineno < len(lines) and \
                    (string_util.is_empty_string(lines[lineno]) or
                     string_util.get_left_padding_spaces(lines[lineno]) > left_padding):
                if not string_util.is_empty_string(lines[lineno]):
                    code_block.append(lines[lineno])
                lineno += 1
            code_piece = "\n".join(
                string_util.left_padding_strings(code_block))
            # dedup code piece
            if code_piece not in code_piece_set:
                code_piece_set.add(code_piece)
                data_bundle = DataBundle(data_dict={
                    "filepath": filepath,
                    "code": code_piece
                })
                self._emit(data_bundle)
    except Exception:
        logger.warn("handle python file: {} failed".format(filepath))
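# The chunking above relies on string_util helpers defined elsewhere in the
# project. From the way they are called, plausible implementations look like
# the sketch below; in particular, left_padding_strings is taken to strip the
# common leading indentation from a chunk, which is a guess based on its name
# and usage, not the project's actual code.
def is_empty_string(s):
    # Assumed semantics: blank or whitespace-only line.
    return len(s.strip()) == 0

def get_left_padding_spaces(s):
    # Assumed semantics: number of leading space characters.
    return len(s) - len(s.lstrip(" "))

def left_padding_strings(strings):
    # Assumed semantics: remove the smallest common indentation of the
    # non-empty lines so that every chunk starts at column zero.
    paddings = [get_left_padding_spaces(s)
                for s in strings if not is_empty_string(s)]
    if not paddings:
        return list(strings)
    common = min(paddings)
    return [s[common:] if not is_empty_string(s) else s for s in strings]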