def from_str(value):
    """Map a CArchive TOC type-code letter onto a CArchive.ArchiveItem.

    Letters that do not correspond to any known item type are logged and
    treated as the generic DATA type.
    """
    try:
        item = CArchive.ArchiveItem(value)
    except ValueError:
        logger.warning(
            f"[!] Unknown item type found in archive with type code letter '{value}'"
        )
        item = CArchive.ArchiveItem.DATA
    return item
def from_int(value):
    """Map a ZlibArchive TOC type-code number onto a ZlibArchive.ArchiveItem.

    Numbers that do not correspond to any known item type are logged and
    treated as the generic DATA type.
    """
    try:
        item = ZlibArchive.ArchiveItem(value)
    except ValueError:
        logger.warning(
            f"[!] Unknown item type found in ZlibArchive with type code number '{value}'"
        )
        item = ZlibArchive.ArchiveItem.DATA
    return item
def check_for_password_file(self):
    """Search for PyInstaller's crypto key file and collect candidate PYZ keys.

    PyInstaller (when built with encryption) drops the 16-character AES key as
    a string constant inside ``pyimod00_crypto_key.pyc``, next to the archive.
    If that file exists, this method sets ``self.encrypted`` and fills
    ``self.potential_keys``: first by disassembling the pyc and walking its
    constants, and — if that produced nothing — by scanning the file's raw
    bytes with a sliding 16-character window.
    """
    self.potential_keys = []
    # Look beside the archive when we know its path; otherwise fall back to cwd.
    if hasattr(self, "archive_path"):
        dir_of_pyz = self.archive_path.parent
    else:
        dir_of_pyz = Path.cwd()
    key_file = dir_of_pyz / "pyimod00_crypto_key.pyc"
    if key_file.exists():
        self.encrypted = True
        logger.debug(
            f"[+] Found ZlibArchive encryption key file at path {key_file}"
        )
        try:
            # We only want the code object; the disassembly listing itself is
            # discarded. Use a context manager so the devnull handle is not
            # leaked (the original passed open(os.devnull, "w") and never
            # closed it).
            with open(os.devnull, "w") as devnull:
                (
                    crypto_key_filename,
                    crypto_key_co,
                    crypto_key_python_version,
                    crypto_key_compilation_timestamp,
                    crypto_key_magic_int,
                    crypto_key_is_pypy,
                    crypto_key_source_size,
                    crypto_key_sip_hash,
                ) = disassemble_file(str(key_file), outstream=devnull)
        except Exception as e:
            logger.warning(
                f"[!] Could not disassemble file {key_file}. Received error: {e}"
            )
        else:
            self.compilation_time = datetime.fromtimestamp(
                crypto_key_compilation_timestamp)
            for const_string in crypto_key_co.co_consts:
                # co_consts can hold ints, None, tuples, or code objects;
                # guard with isinstance so len() cannot raise TypeError.
                if (isinstance(const_string, (str, bytes)) and const_string
                        and len(const_string) == 16):
                    self.potential_keys.append(const_string)
        # If we couldn't disassemble the file to see the consts, search the
        # raw bytes of the file for the password instead.
        if not self.potential_keys:
            with key_file.open("rb") as file_ptr:
                file_strings = utils.parse_for_strings(file_ptr.read())
            s: str
            for s in file_strings:
                if len(s) >= 16 and "pyimod00_crypto_key" not in s:
                    # Slide a 16-char window across the candidate string;
                    # every window position is a potential key.
                    while len(s) >= 16:
                        self.potential_keys.append(s[0:16])
                        s = s[1:]
        logger.info(
            f"[*] Found these potential PyInstaller PYZ Archive encryption keys: {self.potential_keys}"
        )
        if not self.potential_keys:
            logger.error(
                "[*] Encryption key file detected, however no password was able to be retrieved."
            )
def run(_args: List[str] = None) -> None:
    """Orchestrate the flow of the remap command.

    This is the entry-point of the remap command. It calls out to other
    routines and attempts to follow this high-level flow:

        1. Check that program is running in sufficiently new Python
           environment, and parse any arguments
        2. Determine what type of input was passed to program, which will
           ultimately decide what method remap uses to recover the opmap.
        3. Attempt one of the opmap recovery methods (see documentation for
           more on these methods)
        4. If the opmap was successfully recovered, validate it, then write
           it to a file.

    Parameters
    ----------
    _args : List[str]
        If this function is being called from other Python code, remap flags
        and other command-line options can be passed in as a list.
    """
    if sys.version_info < (3, 8):
        logger.critical(
            "[!] This tool can only be run in Python 3.8 or later.")
        sys.exit(1)
    utils.check_for_our_xdis()
    args: argparse.Namespace = _parse_args(_args)
    logging_options: Dict[str, Union[bool, os.PathLike]] = {
        "verbose": args.verbose,
        "quiet": args.quiet
    }
    pydecipher.set_logging_options(**logging_options)
    remapped_bytecode_path: pathlib.Path = pathlib.Path(
        args.remapped_bytecode_path).resolve()
    if args.output:
        output_dir: pathlib.Path = pathlib.Path(args.output.strip()).resolve()
    else:
        # No explicit output dir: derive one under cwd from the input's name.
        output_dir: pathlib.Path = pathlib.Path.cwd()
        output_dir = output_dir / f"remap_output_{utils.slugify(remapped_bytecode_path.name)}"
    # The following block sets up logging to a stringIO stream, which will
    # eventually be placed in a file. We don't immediately log to a file
    # because we don't want to leave a log file on disk unless the program
    # succeeds.
    log_stream: io.StringIO = io.StringIO()
    log_stream__handler: logging.StreamHandler = logging.StreamHandler(
        log_stream)
    log_stream__handler.setFormatter(pydecipher.log_format)
    log_stream__handler.setLevel(logging.DEBUG)
    logger.addHandler(log_stream__handler)
    remappings: Dict[int, Dict[int, int]] = {}
    version: str = ""
    remapping_method: str = ""
    # Record the invocation so it can be embedded in the remapping file.
    cli: str = " ".join(sys.argv) if not _args else " ".join(_args)
    if args.version:
        version = args.version
    if args.megafile:
        # Determine if argument is a version or a path
        if pathlib.Path(args.megafile).exists():
            standard_bytecode_path: pathlib.Path = pathlib.Path(args.megafile)
        else:
            # Argument is a version string: find the matching pre-compiled
            # reference megafile by comparing pyc magic numbers.
            potential_version: str = args.megafile
            magic_num: int = bytecode.version_str_to_magic_num_int(
                potential_version)
            if magic_num:
                compiled_file: str
                for compiled_file in os.listdir(
                        pathlib.Path(__file__).parent / "reference_files" /
                        "compiled"):
                    full_path_obj: pathlib.Path = (
                        pathlib.Path(__file__).parent / "reference_files" /
                        "compiled" / compiled_file)
                    infile: BinaryIO
                    with full_path_obj.open("rb") as infile:
                        if xdis.magics.magic2int(infile.read(4)) == magic_num:
                            logger.info(
                                f"[*] Found matching megafile for version {potential_version}"
                            )
                            standard_bytecode_path: pathlib.Path = full_path_obj
                            break
            # NOTE(review): if magic_num is falsy, or no reference file
            # matched, standard_bytecode_path was never assigned and this
            # truthiness test raises NameError rather than printing the
            # intended error — TODO confirm and pre-initialize to None.
            if not standard_bytecode_path:
                logger.error(
                    "[!] Something went wrong. remap could not find a standard compiled version of this megafile."
                )
                sys.exit(1)
        remappings, version = megafile_remap(standard_bytecode_path,
                                             remapped_bytecode_path)
        remapping_method = "Megafile"
    elif args.opcode_file:
        remappings, version = opcode_constants_remap(remapped_bytecode_path,
                                                     provided_version=version)
        remapping_method = "opcode.pyc constants-walking"
    elif args.standard_bytecode_path:
        # Default method: diff a directory of standard-compiled bytecode
        # against the remapped directory.
        standard_bytecode_path: pathlib.Path = pathlib.Path(
            args.standard_bytecode_path).resolve()
        utils.check_read_access(standard_bytecode_path)
        utils.check_read_access(remapped_bytecode_path)
        utils.check_write_access(output_dir)
        if not remapped_bytecode_path.is_dir():
            raise ValueError(
                "The standard/default remapping method requires a directory containing Python bytecode files"
            )
        if not standard_bytecode_path.is_dir():
            raise ValueError(
                "If you are going to provide your own reference opcode set, it must be a directory of "
                "Python bytecode files")
        remappings, version = standard_pyc_remap(standard_bytecode_path,
                                                 remapped_bytecode_path,
                                                 version=version)
        remapping_method = "Diff'ing against standard library bytecode"
    elif args.check_remapping:
        # Here, remapped_bytecode_path is not actually bytecode, its a
        # remapping file (JSON produced by a previous remap run).
        utils.check_read_access(remapped_bytecode_path)
        remapping_file: TextIO
        with remapped_bytecode_path.open() as remapping_file:
            try:
                remapping_json: Dict["str", Union[str, int]] = json.loads(
                    remapping_file.read())
            except json.decoder.JSONDecodeError as e:
                e: json.decoder.JSONDecodeError
                logger.error(f"Could not read remapping file with error: {e}")
                sys.exit(1)
        version = remapping_json["python_version"]
        remappings_list: Dict[str, Union[
            bool, str, int]] = remapping_json["remapped_opcodes"]
        # Flatten the per-opcode records into a simple OPNAME -> value map.
        remapping_dict: Dict[str, int] = {
            d["opname"]: d["remapped_value"]
            for d in remappings_list
        }
        if bytecode.validate_opmap(version, remapping_dict):
            logger.info("[*] This opmap is valid.")
            return
        else:
            msg: str = "This opmap is not valid."
            if not logging_options["verbose"]:
                msg += " Run with --verbose flag for more information."
            logger.warning(f"[!] {msg}")
            sys.exit(1)
    if remappings:
        # Resolve conflicting observations, then fill in any opcodes that
        # were never observed, before writing the final opmap to disk.
        remappings: Dict[int, int] = fix_remapping_conflicts(remappings)
        remappings: Dict[int, Tuple[int, bool]] = fill_opmap_gaps(remappings,
                                                                  version)
        output_file_path: pathlib.Path = write_remapping_file(
            remappings, version, remapping_method, cli, output_dir=output_dir)
        logger.info(
            f"[*] Remapping file {output_file_path.name} written to {output_file_path.parent}."
        )
        # If we successfully produced the remapping file, we want to also
        # include the logged output of remap.
        log_name: str = datetime.datetime.now().strftime(
            "log_%H_%M_%S_%b_%d_%Y.txt")
        log_file_ptr: TextIO
        with output_dir.joinpath(log_name).open("w") as log_file_ptr:
            log_file_ptr.write(log_stream.getvalue())
        logging_options: Dict[str, Union[bool, os.PathLike]] = {
            "log_path": output_dir.joinpath(log_name)
        }
        pydecipher.set_logging_options(**logging_options)
    else:
        logger.warning(
            "[!] Remap couldn't produce the new opmap. Run with --verbose for more information."
        )
        sys.exit(1)
def standard_pyc_remap(
        standard_bytecode_path: pathlib.Path,
        remapped_bytecode_path: pathlib.Path,
        version: str = None) -> Tuple[Dict[int, Dict[int, int]], str]:
    """Diff compiled code objects from standard library and modified interpreter to try and recreate opcode mappings.

    This method is similar to the megafile method, but at a larger scale. See
    the remap documentation for more information on this method.

    Parameters
    ----------
    standard_bytecode_path: pathlib.Path
        The path on disk to the reference set of standard-compiled bytecode.
        The version of Python for the reference set must correspond to the
        version of Python used as a base for the modified interpreter.
    remapped_bytecode_path: pathlib.Path
        The path on disk to the set of bytecode compiled by the modified
        interpreter
    version: str, optional
        The version of Python that this opcode file corresponds to.

    Returns
    -------
    Tuple[Dict[int, Dict[int, int]], str]
        A tuple containing a dictionary of original_opcode to
        Dict[replacement_opcode:replacement_count] and the opmap's Python
        version. replacement_opcode is an opcode that was seen in place of
        original_opcode, and the replacement_count is the amount of times it
        was seen replacing the original_opcode throughout all the bytecode
        that was analyzed.
    """
    # Index the reference set by bare module name (e.g. "os" for os.cpython-38.pyc).
    reference_files: Dict[str, List[pathlib.Path]] = {}
    determined_version: str = ""
    pyc_file: pathlib.Path
    for pyc_file in standard_bytecode_path.rglob("*.pyc"):
        pyc_file_name: str = pyc_file.name.split(".")[0]
        if pyc_file_name == "__init__":
            continue
        if not determined_version:
            # Sniff the Python version from the first readable magic number.
            try:
                infile: BinaryIO
                with pyc_file.open("rb") as infile:
                    pyc_magic_bytes: bytes = infile.read(4)
                version_set: Set[str] = copy.deepcopy(
                    xdis.magics.by_magic[pyc_magic_bytes])
                determined_version = version_set.pop()
            except Exception:
                pass
            else:
                logger.debug(
                    f"Determined version {determined_version} from reference bytecode."
                )
                if version and bytecode.version_str_to_magic_num_int(
                        determined_version
                ) != bytecode.version_str_to_magic_num_int(version):
                    logger.warning(
                        f"Provided version {version} does not equal the version determined in the reference pyc "
                        f"set ({determined_version}). We will proceed with the version you provided."
                    )
        reference_files.setdefault(pyc_file_name, []).append(pyc_file)
    if not version:
        version = determined_version
    # Index the remapped set the same way, keeping only files that parse as pycs.
    remapped_files: Dict[str, List[pathlib.Path]] = {}
    for pyc_file in remapped_bytecode_path.rglob("*"):
        if not pyc_file.is_file():
            continue
        try:
            kwargs: Dict[str, str] = {"version_hint": version}
            artifact_types.pyc.Pyc(pyc_file, **kwargs)
        except TypeError:
            continue
        pyc_file_name: str = pyc_file.name.split(".")[0]
        if pyc_file_name == "__init__":
            # Too common a filename, causes more problems than its worth to
            # try to include these since they are usually empty anyway.
            continue
        remapped_files.setdefault(pyc_file_name, []).append(pyc_file)
    master_remapping_counts: Dict[int, Dict[int, int]] = {}
    pyc_filename: str
    list_of_filepaths: List[pathlib.Path]
    for pyc_filename, list_of_filepaths in remapped_files.items():
        if pyc_filename not in reference_files:
            continue
        pyc_filepath: pathlib.Path
        for pyc_filepath in list_of_filepaths:
            # Pick the reference pyc whose relative path most resembles this
            # remapped pyc's relative path (longest-common-substring metric).
            reference_file: pathlib.Path = None
            highest_similarity: float = 0
            ref_pyc_filepath: pathlib.Path
            for ref_pyc_filepath in reference_files[pyc_filename]:
                relative_reference_filepath: str = str(
                    ref_pyc_filepath.relative_to(standard_bytecode_path))
                relative_remapped_filepath: str = str(
                    pyc_filepath.relative_to(remapped_bytecode_path))
                path_similarity: float = textdistance.lcsstr.normalized_similarity(
                    relative_reference_filepath, relative_remapped_filepath)
                if path_similarity > highest_similarity:
                    highest_similarity = path_similarity
                    reference_file = ref_pyc_filepath
            if not reference_file:
                continue
            # Repair header problems (wrong/missing magic etc.) on a copy.
            fixed_pyc_file: tempfile.NamedTemporaryFile
            if fixed_pyc_file := artifact_types.pyc.Pyc.check_and_fix_pyc(
                    pyc_filepath, provided_version=version):
                logger.debug(
                    f"[+] Duplicated file {str(pyc_filepath)} to correct issues with the pyc. New filepath:"
                    f" {fixed_pyc_file.name}")
                pyc_filepath = fixed_pyc_file.name
            try:
                # Disassemble both files purely for their code objects; the
                # textual listing goes to devnull (closed via the context
                # manager — the original leaked these handles).
                with open(os.devnull, "w") as devnull:
                    (
                        remapped_filename,
                        remapped_co,
                        remapped_version,
                        remapped_timestamp,
                        remapped_magic_int,
                        remapped_is_pypy,
                        remapped_source_size,
                        remapped_sip_hash,
                    ) = xdis.disasm.disassemble_file(str(pyc_filepath),
                                                     header=True,
                                                     outstream=devnull)
                    (
                        reference_filename,
                        reference_co,
                        reference_version,
                        reference_timestamp,
                        reference_magic_int,
                        reference_is_pypy,
                        reference_source_size,
                        reference_sip_hash,
                    ) = xdis.disasm.disassemble_file(str(reference_file),
                                                     outstream=devnull)
            except Exception:
                continue
            version = str(reference_version)
            try:
                remappings: Dict[int, int] = bytecode.diff_opcode(
                    reference_co, remapped_co, version)
            except RuntimeError:
                # Files didn't actually share source; skip the pair.
                continue
            # Merge these remappings into the larger dictionary.
            opcode_val: int
            remap_options: Dict[int, int]
            for opcode_val, remap_options in remappings.items():
                if opcode_val in master_remapping_counts:
                    remap_option: int
                    count: int
                    for remap_option, count in remap_options.items():
                        if remap_option in master_remapping_counts[opcode_val]:
                            master_remapping_counts[opcode_val][
                                remap_option] += count
                        else:
                            master_remapping_counts[opcode_val][
                                remap_option] = count
                else:
                    master_remapping_counts[opcode_val] = remap_options
    # BUG FIX: the function is documented (and unpacked by its caller) as
    # returning a (remappings, version) tuple, but the original body fell off
    # the end and returned None.
    return master_remapping_counts, version
def diff_opcode(code_standard: CodeType,
                code_remapped: CodeType,
                version: str = None) -> Dict[int, Dict[int, int]]:
    """Calculate remapped opcodes from two Code objects of the same sourcecode.

    Parameters
    ----------
    code_standard : Code (xdis.CodeX or types.CodeType)
        The standard-opcode Code object
    code_remapped : Code (xdis.CodeX or types.CodeType)
        The remapped-opcode Code object
    version : str, optional
        The Python version that marshaled the former two arguments. Used for
        figuring out what operations push arguments to the stack.

    Returns
    -------
    Dict[int, Dict[int, int]]
        A dictionary of original_opcode to
        Dict[replacement_opcode:replacement_count]. replacement_opcode is an
        opcode that was seen in place of original_opcode, and the
        replacement_count is the amount of times it was seen replacing the
        original_opcode throughout all the bytecode that was analyzed.

    Raises
    ------
    RuntimeError
        Args aren't correct type or differ in total opcode count.
    """

    def _recursively_extract_all_code_objects(co) -> List[bytes]:
        """Return co.co_code plus the co_code of every nested code object."""
        co_code_objects: List[bytes] = [co.co_code]
        # Track the code objects themselves for dedup; the original compared
        # code objects against the list of co_code byte strings, so the
        # membership test could never hit.
        seen_code_objects: List[Any] = []
        search_list: List[Union[Any]] = list(co.co_consts)
        co_obj: Any
        for co_obj in search_list:
            if iscode(co_obj) and co_obj not in seen_code_objects:
                seen_code_objects.append(co_obj)
                co_code_objects.append(co_obj.co_code)
                search_list.extend(co_obj.co_consts)
        return co_code_objects

    def _version_uses_wordcode(version: str) -> bool:
        """True when this version uses 2-byte instructions (3.6+).

        The original compared float(version[:3]) >= 3.6, which misparses
        "3.10" as 3.1 and so treated 3.10+ as pre-wordcode.
        """
        parts = version.split(".")
        try:
            return (int(parts[0]), int(parts[1])) >= (3, 6)
        except (IndexError, ValueError):
            # Fall back to the old heuristic for unusual version strings.
            return float(version[:3]) >= 3.6

    def _build_opcode_index(co_code_objects,
                            HAVE_ARGUMENT=90,
                            version: str = None) -> List[int]:
        """Build a flat list of the opcodes contained in the co_code blobs."""
        # Helpful for learning about opcode + arg length:
        # https://laike9m.com/blog/demystifying-extended_arg,124/
        if iscode(co_code_objects):
            co_code_objects: List[bytes] = [co_code_objects]
        # On 3.6+ every opcode is two bytes (wordcode); the second byte is
        # empty when the opcode takes no argument. Hoisted out of the loop.
        wordcode: bool = bool(version) and _version_uses_wordcode(version)
        opcode_index: List[int] = []
        co_code: bytes
        for co_code in co_code_objects:
            i: int = 0
            while i < len(co_code):
                opcode: int = co_code[i]
                opcode_index.append(opcode)
                if wordcode:
                    i += 2
                elif opcode >= HAVE_ARGUMENT:
                    # Pre-3.6: opcodes with arguments carry a 2-byte arg.
                    i += 3
                else:
                    i += 1
        return opcode_index

    if not iscode(code_standard) or not iscode(code_remapped):
        raise RuntimeError(
            "diff_opcode requires two Code objects as arguments")
    HAVE_ARGUMENT: int = 90
    if version:
        try:
            xdis_opcode: ModuleType = xdis.main.get_opcode(
                version, is_pypy=("pypy" in version))
        except TypeError:
            # BUG FIX: the original string lacked the f-prefix, so {version}
            # was logged literally.
            logger.warning(
                f"[!] Couldn't retrieve version {version}'s opcodes from xdis.")
        else:
            HAVE_ARGUMENT = xdis_opcode.HAVE_ARGUMENT
    standard_code_objects: List[bytes] = _recursively_extract_all_code_objects(
        code_standard)
    remapped_code_objects: List[bytes] = _recursively_extract_all_code_objects(
        code_remapped)
    standard_opcodes_list: List[int] = _build_opcode_index(
        standard_code_objects, HAVE_ARGUMENT, version=version)
    remapped_opcodes_list: List[int] = _build_opcode_index(
        remapped_code_objects, HAVE_ARGUMENT, version=version)
    if len(standard_opcodes_list) != len(remapped_opcodes_list):
        # This is to prevent cases where files are being compared that don't
        # share source code.
        raise RuntimeError(
            "The two co_code objects differ in length and therefore cannot do a comparison of the opcodes."
        )
    # Tally, per standard opcode, how often each remapped opcode stood in
    # for it at the same instruction position.
    remappings: Dict[int, Dict[int, int]] = {}
    i: int
    for i, remapped_opcode in enumerate(remapped_opcodes_list):
        options: Dict[int, int] = remappings.setdefault(
            standard_opcodes_list[i], {})
        options[remapped_opcode] = options.get(remapped_opcode, 0) + 1
    return remappings
def decompile_pyc(
    arg_tuple: Tuple[pathlib.Path, Dict[str, int], Dict[str,
                                                        Union[bool,
                                                              os.PathLike]]]
) -> str:
    """Decompile a single Python bytecode file.

    Parameters
    ----------
    arg_tuple: Tuple[pathlib.Path, Dict[str, int], Dict[str, Union[bool, os.PathLike]]]
        A tuple containing the arguments for this function. This is a tuple
        because pebble's Pool.map() function couldn't pass multiple arguments
        to a subprocessed function call. The tuple entries correspond to the
        following arguments:

        pyc_file : pathlib.Path
            The path to the compiled Python file
        alternate_opmap : Dict[str, int], optional
            If this bytecode file was produced by an interpreter with
            remapped opcodes, you must provide the opmap as a
            OPNAME: OPCODE dictionary
        logging_options: Dict[str, Union[bool, os.PathLike], optional
            A dictionary of logging options. This is only needed when
            pydecipher is performing multi-processed decompilation. The keys
            can be the following strings:

            verbose: bool
                True will enable verbose logging.
            quiet: bool
                True will silence all console logging.
            log_path: pathlib.Path
                If a path object is passed in as the log_path, the running
                instance of pydecipher will continue logging to that file.

    Returns
    -------
    str
        There are several different return values:

        * **no_action**: This file was not decompiled.
        * **success**: This file was successfully decompiled.
        * **error**: This file could not be decompiled 100% successfully.
        * **opcode_error**: The error message returned by uncompyle6
          indicates this file may have remapped opcodes
    """
    pyc_file: pathlib.Path = arg_tuple[0]
    # `or None` normalizes falsy values (empty dicts) to None.
    alternate_opmap: Dict[str, int] = arg_tuple[1] or None
    logging_options: Dict[str, Union[bool, os.PathLike]] = arg_tuple[2] or None
    if not pyc_file.is_file():
        return "no_action"
    # Because this function runs in a new pydecipher process entirely, logging
    # options set during runtime (from command-line flags) do not carry over
    # automatically. We must pass these through manually, and reset the options
    # for this specific process.
    if logging_options and not pydecipher.log_path:
        pydecipher.set_logging_options(**logging_options)
    # uncompyle6 writes to stdout/stderr; capture both so the decompiled
    # source can be redirected into a .py file instead of the console.
    hijacked_stdout: io.StringIO = io.StringIO()
    hijacked_stderr: io.StringIO = io.StringIO()
    with redirect_stdout(hijacked_stdout), redirect_stderr(hijacked_stderr):
        # Chop off c in pyc to form the output .py filename.
        new_file_name: pathlib.Path = pathlib.Path(
            str(pyc_file.resolve())[:-1])
        # This prohibits the overwriting of existing files.
        # if new_file_name.exists() and new_file_name.stat().st_size:
        #     return "no_action"
        logger.debug(
            f"[*] Decompiling file {pyc_file} of size {pyc_file.stat().st_size}"
        )
        if not alternate_opmap:
            # Standard opcodes: let uncompyle6 handle the file end-to-end.
            try:
                uncompyle6.decompile_file(str(pyc_file), outstream=sys.stdout)
            except uncompyle6.semantics.parser_error.ParserError as e:
                logger.warning(f"[!] Failed to decompile file {pyc_file}")
                # A parse failure matching this pattern is the telltale sign
                # of an interpreter with remapped opcodes.
                if REMAPPED_OPCODE_ERROR_REGEX.match(str(e.error)):
                    logger.error(
                        f"[!] {pyc_file.name} failed to decompile with an error that indicate its opcode "
                        "mappings may have been remapped to prevent analysis.")
                    return "opcode_error"
                return "error"
            except Exception as e:
                e: Exception
                logger.error(
                    f"[!] Failed to decompile file {pyc_file} with error: {e}")
                # Preserve whatever partial source was produced before the
                # failure — partial output is still useful to an analyst.
                stdout_val: str = hijacked_stdout.getvalue()
                if stdout_val:
                    with new_file_name.open("w") as file_ptr:
                        file_ptr.write(stdout_val)
                return "error"
            else:
                with new_file_name.open("w") as file_ptr:
                    file_ptr.write(hijacked_stdout.getvalue())
                logger.info(f"[+] Successfully decompiled {pyc_file}")
                return "success"
        else:
            # Remapped opcodes: disassemble through xdis with the alternate
            # opmap first, then hand the code object to uncompyle6.
            filename: str
            co: CodeType  # can also be xdis.Code* objects
            version: float
            timestamp: int  # seconds since epoch
            magic_int: int
            is_pypy: bool
            source_size: int
            sip_hash: str
            try:
                (
                    filename,
                    co,
                    version,
                    timestamp,
                    magic_int,
                    is_pypy,
                    source_size,
                    sip_hash,
                ) = xdis.main.disassemble_file(str(pyc_file),
                                               outstream=open(os.devnull,
                                                              "w"),
                                               alternate_opmap=alternate_opmap)
                output_file: TextIO
                with new_file_name.open(mode="w") as output_file:
                    uncompyle6.main.decompile(
                        version,
                        co,
                        timestamp=timestamp,
                        source_size=source_size,
                        magic_int=magic_int,
                        is_pypy=is_pypy,
                        out=output_file,
                    )
            except Exception as e:
                e: Exception
                logger.info(
                    f"[!] Failed to decompile file {pyc_file} with error: {e}")
                return "error"
            else:
                logger.info(f"[+] Successfully decompiled {pyc_file}")
                return "success"
def process_pycs(pyc_iterable: Iterable[os.PathLike],
                 alternate_opmap: Dict[str, int] = None) -> None:
    """Multi-processed decompilation orchestration of compiled Python files.

    Currently, pydecipher uses `uncompyle6`_ as its decompiler. It works well
    with `xdis`_ (same author) and allows for the decompilation of Code
    objects using alternate opmaps (with our extension of xdis). This function
    will start up CPU count * 2 pydecipher processes to decompile the given
    Python. Attempts to check for debugger, in which case the decompilation
    will be single-threaded to make debugging easier.

    .. _uncompyle6: https://github.com/rocky/python-uncompyle6/
    .. _xdis: https://github.com/rocky/python-xdis

    Parameters
    ----------
    pyc_iterable : Iterable[os.PathLike]
        An iterable of pathlib.Path objects, referencing compiled Python
        files to decompile.
    alternate_opmap : Dict[str, int], optional
        An opcode map of OPNAME: OPCODE (i.e. 'POP_TOP': 1). This should be a
        complete opmap for the Python version of the files being decompiled.
        Even if only two opcodes were swapped, the opcode map passed in
        should contain all 100+ Python bytecode operations.
    """
    return_status_codes: List[str] = []
    # This checks if a debugger (e.g. PyCharm's) is attached.
    if sys.gettrace():
        # Single-threaded for easier debugging.
        logger.debug(
            "[!] Debugger detected, not using multiprocessing for decompilation of pyc files."
        )
        pyc_file: pathlib.Path
        for pyc_file in pyc_iterable:
            return_status_codes.append(
                decompile_pyc((pyc_file, alternate_opmap,
                               pydecipher.get_logging_options())))
    else:
        pool: pebble.ProcessPool
        with pebble.ProcessPool(os.cpu_count() * 2) as pool:
            iterables = [(pyc, alternate_opmap,
                          pydecipher.get_logging_options())
                         for pyc in pyc_iterable]
            future: pebble.ProcessMapFuture = pool.map(decompile_pyc,
                                                       iterables,
                                                       timeout=300)
            iterator: Iterable = future.result()
            index: int = 0
            while True:
                # BUG FIX: resolve the failing file's path before iterating.
                # Originally it was only assigned inside the TimeoutError
                # branch, so a ProcessExpired or generic Exception raised
                # NameError instead of logging the real failure.
                if index < len(iterables):
                    failed_pyc_path: str = str(iterables[index][0])
                else:
                    failed_pyc_path = "<unknown>"
                try:
                    result: Any = next(iterator)
                    return_status_codes.append(result)
                except StopIteration:
                    break
                except TimeoutError as e:
                    e: TimeoutError
                    logger.error(
                        f"[!] Timed out ({e.args[1]}s) trying to decompile {failed_pyc_path}."
                    )
                    return_status_codes.append("error")
                except pebble.ProcessExpired as e:
                    e: pebble.ProcessExpired
                    logger.error(
                        f"[!] Failed to decompile {failed_pyc_path} (process expired with status code {e.exitcode}."
                    )
                    return_status_codes.append("error")
                except Exception as e:
                    e: Exception
                    logger.error(
                        f"[!] Failed to decompile {failed_pyc_path} with unknown error: {e}"
                    )
                    return_status_codes.append("error")
                finally:
                    index += 1
    successes: int = return_status_codes.count("success")
    opcode_errors: int = return_status_codes.count("opcode_error")
    errors: int = return_status_codes.count("error") + opcode_errors
    if opcode_errors:
        # BUG FIX: added the missing space before `remap` — the implicit
        # string concatenation previously produced "using`remap`".
        logger.warning(
            f"[!] {opcode_errors} file(s) failed to decompile with an error "
            "that indicate its opcode mappings may have been remapped. Try using "
            "`remap` on this set of bytecode.")
    if successes and not errors:
        logger.info(f"[+] Successfully decompiled {successes} .pyc files.")
    elif successes and errors:
        logger.warning(
            f"[!] Successfully decompiled {successes} .pyc files. Failed to decompile {errors} files. "
            "See log for more information.")
    elif not successes and errors:
        logger.error(
            f"[!] Failed to decompile all {errors} .pyc files. See log for more information."
        )
    else:
        logger.warning(
            "[!] No pyc files were decompiled. See log for more information.")
def run(args_in: List[str] = None) -> None:
    """Orchestrate the flow of the pydecipher command.

    This function is the entry-point of the pydecipher command. It calls out
    to other routines and generally attempts to follow this high-level flow:

        1. Parse program arguments.
        2. Check that input files are readable and output locations are
           writeable, including that the the program is running in a
           sufficiently new Python environment (3.6+).
        3. Recursively call unpack on the artifact until all items of
           interest are extracted.
        4. Decompile any Python bytecode found through the unpacking process.

    Parameters
    ----------
    args_in : List[str]
        If this function is being called from other Python code, pydecipher
        flags and other command-line options can be passed in as a list.
    """
    if sys.version_info < (3, 8):
        logger.critical(
            "[!] This tool can only be run in Python 3.8 or later.")
        sys.exit(1)
    utils.check_for_our_xdis()
    args: argparse.Namespace = _parse_args(args_in)
    logging_options: Dict[str, Union[bool, os.PathLike]] = {
        "verbose": args.verbose,
        "quiet": args.quiet
    }
    pydecipher.set_logging_options(**logging_options)
    artifact_path: pathlib.Path = pathlib.Path(args.artifact_path).resolve()
    utils.check_read_access(artifact_path)
    # relocate_pys: whether decompiled .py files must be moved into
    # output_dir afterwards (they are initially written beside their .pyc).
    relocate_pys: bool = False
    pyc_files: Iterable[os.PathLike] = []
    if args.output:
        output_dir: pathlib.Path = pathlib.Path(args.output.strip()).resolve()
        if artifact_path.is_dir():
            relocate_pys = True
    elif artifact_path.is_dir():
        # No output dir given for a directory artifact: unpack in place.
        output_dir = artifact_path
        relocate_pys = True
    else:
        # Single-file artifact, no output dir: derive one under cwd.
        output_dir: pathlib.Path = (
            pathlib.Path.cwd() /
            f"pydecipher_output_{utils.slugify(artifact_path.name.split('.')[0])}"
        )
    if artifact_path.is_file() and os.path.splitext(
            artifact_path)[1].lower() in (".pyc", ".pyo"):
        relocate_pys = True
        pyc_files = [artifact_path]
    # The following block sets up logging to a stringIO stream, which will
    # eventually be placed in a file. We don't immediately log to a file
    # because we don't want to leave a log file on disk unless the program
    # succeeds, at least past the 'unpack' call.
    log_stream: io.StringIO = io.StringIO()
    log_stream__handler: logging.StreamHandler = logging.StreamHandler(
        log_stream)
    log_stream__handler.setFormatter(pydecipher.log_format)
    log_stream__handler.setLevel(logging.DEBUG)
    logger.addHandler(log_stream__handler)
    version_hint: str = args.version_hint
    alternate_opmap: Dict[str, int] = None
    if args.remapping_file:
        # A remapping file supplies both the alternate opmap and a reliable
        # version hint for the bytecode being processed.
        remap_file: pathlib.Path = pathlib.Path(args.remapping_file).resolve()
        logger.info(f"[*] Using remap file {remap_file}")
        utils.check_read_access(remap_file)
        alternate_opmap: Dict[str, int] = bytecode.create_opmap_from_file(
            remap_file)
        with remap_file.open("r") as remapping_file:
            file_json: str = json.loads(remapping_file.read())
            remap_file_version: str = file_json["python_version"]
            version_hint = remap_file_version
    utils.check_write_access(output_dir)
    # Dump all pyc files
    if artifact_path.is_dir():
        kwargs: Dict[str, str] = {"version_hint": version_hint}
        dirpath: str
        dirnames: List[str]
        filenames: List[str]
        for (dirpath, dirnames, filenames) in os.walk(artifact_path):
            filename: str
            for filename in filenames:
                if os.path.splitext(filename)[1].lower() in (".pyc", ".pyo"):
                    full_path: pathlib.Path = pathlib.Path(dirpath).joinpath(
                        filename)
                    try:
                        pyc_class_obj: artifact_types.pyc.Pyc = artifact_types.pyc.Pyc(
                            full_path, output_dir=full_path.parent, **kwargs)
                    except TypeError:
                        # Not actually a parseable pyc; skip it.
                        pass
                    else:
                        pyc_class_obj.unpack()
        pyc_files: List[pathlib.Path] = list(
            artifact_path.rglob("*.[pP][yY][cCoO]"))
    else:
        unpack(artifact_path,
               output_dir=str(output_dir),
               version_hint=version_hint)
    # If we produced files, we want to also include the logged output of
    # pydecipher. If we didn't produce anything, we can assume the program
    # failed/had uninteresting output that doesn't need to be kept. The one
    # exception to this is when we pass in a single pyc file, or a directory
    # of pyc files, to be decompiled.
    if (output_dir.exists() and os.listdir(output_dir)) or pyc_files:
        output_dir.mkdir(parents=True, exist_ok=True)
        log_name: str = datetime.datetime.now().strftime(
            "log_%H_%M_%S_%b_%d_%Y.txt")
        with output_dir.joinpath(log_name).open("w") as log_file_ptr:
            log_file_ptr.write(log_stream.getvalue())
        logging_options: Dict[str, pathlib.Path] = {
            "log_path": output_dir.joinpath(log_name)
        }
        pydecipher.set_logging_options(**logging_options)
    else:
        logger.warning("[!] This artifact produced no additional output.")
        return
    # Determine which pyc files to decompile
    if not pyc_files:
        pyc_files: Generator[os.PathLike, None,
                             None] = output_dir.rglob("*.[pP][yY][cCoO]")
        if not args.decompile_all:
            max_depth: int = 10
            # Search output directory with increasing recursive depth to find
            # first level of directories with .pyc files
            depth: int
            for depth in range(max_depth):
                tmp: List[os.PathLike] = list(
                    pydecipher.utils.rglob_limit_depth(output_dir,
                                                       "*.[pP][yY][cCoO]",
                                                       depth))
                if tmp:
                    pyc_files = tmp
                    break
    # Dispatch a pool of processes to decompile the specified group of pyc
    # files.
    bytecode.process_pycs(pyc_files, alternate_opmap=alternate_opmap)
    # If any decompiled python needs to be moved to the output directory, do
    # that now. This will only happen if the user passed in a pyc artifact
    # (single file or dir). We decompile the .pyc file into a .py file
    # alongside the .pyc file on disk, then move it to the designated output
    # directory.
    if artifact_path.is_file():
        relative_root: pathlib.Path = artifact_path.parent
    else:
        relative_root: pathlib.Path = artifact_path
    if relocate_pys:
        pyc_file: pathlib.Path
        for pyc_file in pyc_files:
            # The .py sits beside the .pyc (same path minus trailing 'c').
            py_file: pathlib.Path = pathlib.Path(str(pyc_file)[:-1])
            if not py_file.exists():
                continue
            rel_path: pathlib.Path = py_file.relative_to(relative_root)
            new_filepath: pathlib.Path = output_dir.joinpath(rel_path)
            py_file.rename(new_filepath)
    # Perform any cleanup functions on output of decompilation
    pydecipher.artifact_types.py2exe.PYTHONSCRIPT.cleanup(output_dir)
def extract_files(self):
    """Extract every table-of-contents entry of this CArchive to disk.

    For each ``CTOCEntry`` in ``self.toc``: slice its bytes out of
    ``self.archive_contents``, zlib-decompress if the entry's compression
    flag is set, pick an output filename under ``self.output_dir``, and
    write the payload. Marshalled PYSOURCE entries get a reconstructed
    .pyc header; embedded PYZ/zip archives are recursively unpacked via
    ``pydecipher.unpack``.
    """
    # Magic numbers observed in PYMODULE entries; used to rebuild .pyc
    # headers for headerless marshalled PYSOURCE entries.
    magic_nums: set = set()
    decompression_errors = 0
    successfully_extracted = 0
    entry: CTOCEntry
    for entry in self.toc:
        data = self.archive_contents[entry.entry_offset:entry.entry_offset +
                                     entry.compressed_data_size]
        if entry.compression_flag:
            try:
                data = zlib.decompress(data)
            except zlib.error as e:
                decompression_errors += 1
                logger.debug(
                    f"[!] PyInstaller CArchive decompression failed with error: {e}"
                )
                continue
            else:
                # A size mismatch is not fatal, but worth flagging: it can
                # indicate a hand-edited archive.
                if len(data) != entry.uncompressed_data_size:
                    logger.warning(
                        f"[!] {entry.name} entry in CArchive listed its uncompressed data size as"
                        f" {entry.uncompressed_data_size}, however in actuality, uncompressed to be {len(data)}"
                        " bytes. This may be a sign that the CArchive was manually altered."
                    )
        # Entry names may carry Windows separators even when unpacking on
        # a POSIX host; PureWindowsPath splits those correctly.
        if "\\" in entry.name:
            tmp: PureWindowsPath = pathlib.PureWindowsPath(entry.name)
        else:
            tmp: Path = Path(entry.name)
        file_path = pathlib.Path(self.output_dir).joinpath(tmp)
        if len(file_path.parents) > 1:  # every path has '.' as a parent
            file_path.parent.mkdir(parents=True, exist_ok=True)
        if entry.type_code == self.ArchiveItem.PYSOURCE:
            if ord(data[:1]) == ord(xdis.marsh.TYPE_CODE) or ord(
                    data[:1]) == (ord(xdis.marsh.TYPE_CODE)
                                  | xdis.unmarshal.FLAG_REF):
                # Raw marshalled code object: restore a .pyc header so the
                # file can be decompiled downstream.
                file_path = file_path.parent / (file_path.name + ".pyc")
                if len(magic_nums) > 1:
                    magic_num = next(iter(magic_nums))
                    logger.warning(
                        "[!] More than one magic number found within this CArchive. Using magic number"
                        f" {magic_num}, but also found numbers: {magic_nums}"
                    )
                elif len(magic_nums) == 0:
                    logger.warning(
                        f"[!] No magic numbers have been found yet, queueing this file for later."
                    )
                    # TODO: add this file to a do-later list, when you know the magic num
                    # TODO does this actually happen? dig deeper...
                # BUG FIX: previously next(iter(magic_nums)) ran
                # unconditionally and raised StopIteration when no magic
                # number had been seen yet. Now the header is only
                # prepended when we actually have a magic number; otherwise
                # the raw marshalled data is written as-is.
                if magic_nums:
                    data = pydecipher.bytecode.create_pyc_header(
                        next(iter(magic_nums))) + data
            else:
                file_path = file_path.parent / (file_path.name + ".py")
                # PyInstaller's own bootstrap scripts are prefixed "pyi";
                # anything else is likely the packed program's entrypoint.
                if "pyi" not in entry.name:
                    logger.info(
                        f"[!] Potential entrypoint found at script {entry.name}.py"
                    )
        elif entry.type_code == self.ArchiveItem.PYMODULE:
            magic_bytes = data[:4]  # Python magic value
            magic_nums.add(magic2int(magic_bytes))
            file_path = file_path.parent / (file_path.name + ".pyc")
        if entry.type_code != self.ArchiveItem.RUNTIME_OPTION:
            self.output_dir.mkdir(parents=True, exist_ok=True)
            with file_path.open(mode="wb") as f:
                f.write(data)
            successfully_extracted += 1
        # Nested archives (PYZ, zipfiles) are unpacked recursively into a
        # sibling "<name>_output" directory.
        if entry.type_code in (self.ArchiveItem.PYZ,
                               self.ArchiveItem.ZIPFILE):
            output_dir_name = (str(
                file_path.parent.joinpath(
                    utils.slugify(file_path.name.split(".")[0]))) +
                               "_output")
            pydecipher.unpack(file_path, output_dir=output_dir_name)
    if decompression_errors:
        logger.debug(
            f"[!] Failed to write {decompression_errors} files due to decompression errors."
        )
    if successfully_extracted:
        logger.info(
            f"[+] Successfully extracted {successfully_extracted} files from this CArchive."
        )
def parse_toc(self):
    """Parse this CArchive's cookie and table of contents.

    First reads the trailing "cookie" struct (its layout differs between
    PyInstaller 2.0 and 2.1+), which yields the package length, TOC
    offset/size, and the Python version used to build the archive. Then
    walks the TOC bytes, appending one ``CTOCEntry`` per record to
    ``self.toc``. Returns early (with a warning) if neither cookie layout
    matches, leaving ``self.pyinstaller_version`` as "unknown".
    """
    # Read CArchive cookie
    if self.pyinstaller_version == 2.0 or self.pyinstaller_version == "unknown":
        try:
            (
                magic,
                self.length_of_package,
                self.toc_offset,
                self.toc_size,
                self.python_version,
            ) = struct.unpack(
                "!8siiii",
                self.archive_contents[self.magic_index:self.magic_index +
                                      self.PYINST20_COOKIE_SIZE],
            )
        except struct.error:
            # Buffer does not match the 2.0 cookie layout; fall through
            # and try the 2.1 layout below. (Previously a bare `except:`
            # that swallowed every exception type.)
            pass
        else:
            self.pyinstaller_version = 2.0
    if self.pyinstaller_version == 2.1 or self.pyinstaller_version == "unknown":
        try:
            (
                magic,
                self.length_of_package,
                self.toc_offset,
                self.toc_size,
                self.python_version,
                self.python_dynamic_lib,
            ) = struct.unpack(
                "!8siiii64s",
                self.archive_contents[self.magic_index:self.magic_index +
                                      self.PYINST21_COOKIE_SIZE],
            )
        except struct.error:
            # Not a 2.1-style cookie either; version stays "unknown".
            pass
        else:
            self.pyinstaller_version = 2.1
            if self.python_dynamic_lib:
                # The 64-byte field is NUL-padded ASCII.
                self.python_dynamic_lib = self.python_dynamic_lib.decode(
                    "ascii").rstrip("\x00")
    if self.pyinstaller_version == "unknown":
        logger.warning(
            "[!] Could not parse CArchive because PyInstaller version is unknown."
        )
        return
    # Cookie stores the version as e.g. 27 or 36; convert to 2.7 / 3.6.
    self.python_version = float(self.python_version) / 10
    logger.info(
        f"[*] This CArchive was built with Python {self.python_version}")
    logger.debug(f"[*] CArchive Package Size: {self.length_of_package}")
    logger.debug(f"[*] CArchive Python Version: {self.python_version}")
    if self.pyinstaller_version == 2.1:
        logger.debug(
            f"[*] CArchive Python Dynamic Library Name: {self.python_dynamic_lib}"
        )
    self.toc = []
    toc_bytes = self.archive_contents[self.toc_offset:self.toc_offset +
                                      self.toc_size]
    while toc_bytes:
        # Each record is length-prefixed; the name field fills whatever
        # space remains after the fixed-size portion (ENTRYLEN).
        (entry_size, ) = struct.unpack("!i", toc_bytes[0:4])
        name_length = entry_size - self.CTOCEntry.ENTRYLEN
        (
            entry_offset,
            compressed_data_size,
            uncompressed_data_size,
            compression_flag,
            type_code,
            name,
        ) = struct.unpack(f"!iiiBB{name_length}s", toc_bytes[4:entry_size])
        name = name.decode("utf-8").rstrip("\0")
        if name == "":
            name = str(uniquename())
            logger.debug(
                f"[!] Warning: Found an unnamed file in CArchive. Using random name {name}"
            )
        type_code = chr(type_code)
        self.toc.append(
            self.CTOCEntry(
                entry_offset,
                compressed_data_size,
                uncompressed_data_size,
                compression_flag,
                type_code,
                name,
            ))
        toc_bytes = toc_bytes[entry_size:]
    logger.debug(
        f"[*] Found {len(self.toc)} entries in this PyInstaller CArchive")