def extract_with_fallback(location, target_dir, extractor1, extractor2):
    """
    Extract archive at `location` to `target_dir` trying first the
    `extractor1` function. If extraction fails, attempt extraction again with
    the `extractor2` function. Return a list of warning messages. Raise
    exceptions on errors.

    Note: there are a few cases where the primary extractor for a type may
    fail and a secondary extractor will succeed.
    """
    abs_location = os.path.abspath(os.path.expanduser(location))
    abs_target_dir = unicode(os.path.abspath(os.path.expanduser(target_dir)))
    # attempt extract first to a temp dir
    temp_target1 = unicode(fileutils.get_temp_dir('extract1'))
    try:
        warnings = extractor1(abs_location, temp_target1)
        if TRACE:
            logger.debug('extract_with_fallback: temp_target1: %(temp_target1)r' % locals())
        fileutils.copytree(temp_target1, abs_target_dir)
    except Exception:
        temp_target2 = unicode(fileutils.get_temp_dir('extract2'))
        try:
            warnings = extractor2(abs_location, temp_target2)
            if TRACE:
                logger.debug('extract_with_fallback: temp_target2: %(temp_target2)r' % locals())
            fileutils.copytree(temp_target2, abs_target_dir)
        finally:
            fileutils.delete(temp_target2)
    finally:
        fileutils.delete(temp_target1)
    return warnings
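
# A minimal usage sketch for extract_with_fallback, under the extractcode
# layout seen in these snippets: archive.extract_tar appears in the snippets
# below, and archive.extract_7z is assumed to exist as a sibling extractor;
# the paths are hypothetical.
from extractcode import archive

warnings = extract_with_fallback(
    location='/tmp/samples/archive.tar.gz',  # hypothetical path
    target_dir='/tmp/extracted',             # hypothetical path
    extractor1=archive.extract_tar,
    extractor2=archive.extract_7z,           # assumed fallback extractor
)
for w in warnings:
    print(w)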
def get_temp_dir(self, sub_dir_path=None):
    """
    Create a unique new temporary directory location. Create directories
    identified by `sub_dir_path` if provided in this temporary directory.
    Return the location for this unique directory joined with the
    `sub_dir_path` if any.
    """
    # ensure that we have a new unique temp directory for each test run
    global test_run_temp_dir
    if not test_run_temp_dir:
        from scancode_config import scancode_root_dir
        test_tmp_root_dir = path.join(scancode_root_dir, 'tmp')
        # now we add a space in the path for testing paths with spaces
        test_run_temp_dir = fileutils.get_temp_dir(
            base_dir=test_tmp_root_dir, prefix='scancode-tk-tests -')
    if on_linux and py2:
        test_run_temp_dir = fsencode(test_run_temp_dir)
    test_run_temp_subdir = fileutils.get_temp_dir(
        base_dir=test_run_temp_dir, prefix='')
    if sub_dir_path:
        # create a sub directory hierarchy if requested
        sub_dir_path = to_os_native_path(sub_dir_path)
        test_run_temp_subdir = path.join(test_run_temp_subdir, sub_dir_path)
        fileutils.create_dir(test_run_temp_subdir)
    return test_run_temp_subdir
def get_gem_metadata(location):
    """
    Return the string content of the metadata file of a .gem archive file at
    `location`. Raise an Exception if no metadata can be found.
    """
    extract_loc = None
    try:
        # a .gem is a tar archive: extract its first level to a temp dir
        extract_loc = fileutils.get_temp_dir(prefix='scancode-extract-')
        abs_location = abspath(expanduser(location))
        archive.extract_tar(abs_location, extract_loc)
        # The gzipped metadata is the second level of archive.
        metadata = os.path.join(extract_loc, 'metadata')
        # or it can be a plain, non-gzipped file
        metadata_gz = metadata + '.gz'
        if os.path.exists(metadata):
            with open(metadata, 'rb') as met:
                content = met.read()
        elif os.path.exists(metadata_gz):
            content = archive.get_gz_compressed_file_content(metadata_gz)
        else:
            raise Exception('No gem metadata found in RubyGem .gem file.')
        return content
    finally:
        if extract_loc:
            fileutils.delete(extract_loc)
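
# A minimal usage sketch for get_gem_metadata, assuming a hypothetical .gem
# path; the returned content is the gemspec metadata (YAML) shipped inside
# the .gem tar archive:
content = get_gem_metadata('/tmp/samples/rails-5.0.0.gem')  # hypothetical path
print(content[:200])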
def try_to_extract(location, target_dir, extractor):
    """
    Extract archive at `location` to `target_dir` trying the `extractor`
    function. If extraction fails, return the warnings collected so far
    without raising exceptions.

    Note: there are a few cases where we want to attempt extracting something
    but do not care if this fails.
    """
    abs_location = os.path.abspath(os.path.expanduser(location))
    abs_target_dir = compat.unicode(
        os.path.abspath(os.path.expanduser(target_dir)))
    temp_target = compat.unicode(
        fileutils.get_temp_dir(prefix='extractcode-extract1-'))
    warnings = []
    try:
        warnings = extractor(abs_location, temp_target)
        if TRACE:
            logger.debug('try_to_extract: temp_target: %(temp_target)r' % locals())
        fileutils.copytree(temp_target, abs_target_dir)
    except Exception:
        return warnings
    finally:
        fileutils.delete(temp_target)
    return warnings
def extract_file(location, target, kinds=extractcode.default_kinds):
    """
    Extract a single archive at `location` in the `target` directory if it is
    of a kind supported in the `kinds` kind tuple.
    """
    warnings = []
    errors = []
    extractor = archive.get_extractor(location, kinds)
    if TRACE:
        logger.debug(
            'extract_file: extractor: for: %(location)r with kinds: %(kinds)r : ' % locals()
            + getattr(extractor, '__module__', '')
            + '.' + getattr(extractor, '__name__', ''))
    if extractor:
        yield ExtractEvent(location, target, done=False, warnings=[], errors=[])
        try:
            # extract first to a temp directory.
            # if there is an error, the extracted files will not be moved
            # to target
            tmp_tgt = fileutils.get_temp_dir('extract')
            abs_location = abspath(expanduser(location))
            warnings.extend(extractor(abs_location, tmp_tgt))
            fileutils.copytree(tmp_tgt, target)
            fileutils.delete(tmp_tgt)
        except Exception as e:
            if TRACE:
                logger.debug(
                    'extract_file: ERROR: %(location)r: %(errors)r, %(e)r.\n' % locals())
            errors = [str(e).strip(' \'"')]
        finally:
            yield ExtractEvent(location, target, done=True, warnings=warnings, errors=errors)
def extract_file(location, target, kinds=extractcode.default_kinds, verbose=False):
    """
    Extract a single archive at `location` in the `target` directory if it is
    of a kind supported in the `kinds` kind tuple.
    """
    warnings = []
    errors = []
    extractor = archive.get_extractor(location, kinds)
    if TRACE:
        logger.debug(
            'extract_file: extractor: for: %(location)r with kinds: %(kinds)r : ' % locals()
            + getattr(extractor, '__module__', '')
            + '.' + getattr(extractor, '__name__', ''))
    if extractor:
        yield ExtractEvent(location, target, done=False, warnings=[], errors=[])
        try:
            # extract first to a temp directory: if there is an error, the
            # extracted files will not be moved to target
            tmp_tgt = fileutils.get_temp_dir(prefix='scancode-extract-')
            abs_location = abspath(expanduser(location))
            warns = extractor(abs_location, tmp_tgt) or []
            warnings.extend(warns)
            fileutils.copytree(tmp_tgt, target)
            fileutils.delete(tmp_tgt)
        except Exception as e:
            errors = [str(e).strip(' \'"')]
            if verbose:
                errors.append(traceback.format_exc())
            if TRACE:
                tb = traceback.format_exc()
                logger.debug('extract_file: ERROR: %(location)r: %(errors)r\n%(e)r\n%(tb)s' % locals())
        finally:
            yield ExtractEvent(location, target, done=True, warnings=warnings, errors=errors)
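
# extract_file is a generator: it yields a start ExtractEvent (done=False)
# and a final one (done=True) carrying the collected warnings and errors.
# A minimal consumption sketch with hypothetical paths:
for event in extract_file('/tmp/samples/archive.zip', '/tmp/out'):
    if event.done:
        if event.errors:
            print('extraction failed:', event.errors)
        elif event.warnings:
            print('extracted with warnings:', event.warnings)
        else:
            print('extracted cleanly')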
def convert_to_utf8(location):
    """
    Convert the file at `location` to UTF-8 text. Return the location of the
    converted file, or the original `location` if the file is not text or if
    no encoding could be detected.
    """
    if not get_type(location).is_text:
        return location
    start = open(location, 'rb').read(4096)
    encoding = chardet.detect(start)
    if encoding:
        encoding = encoding.get('encoding', None)
        if encoding:
            target = os.path.join(fileutils.get_temp_dir('markup'),
                                  fileutils.file_name(location))
            with codecs.open(location, 'rb', encoding=encoding,
                             errors='replace', buffering=16384) as inf:
                with codecs.open(target, 'wb', encoding='utf-8') as outf:
                    outf.write(inf.read())
            return target
        else:
            # chardet somehow failed to detect an encoding
            return location
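
# A standalone sketch of the detection step above, using the real chardet
# API: chardet.detect returns a dict with 'encoding' and 'confidence' keys.
# The sample path is hypothetical.
import chardet

start = open('/tmp/samples/page.html', 'rb').read(4096)  # hypothetical path
result = chardet.detect(start)
print(result.get('encoding'), result.get('confidence'))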
def extract_file(location, target, kinds=extractcode.default_kinds):
    """
    Extract a single archive at `location` in the `target` directory if it is
    of a kind supported in the `kinds` kind tuple.
    """
    warnings = []
    errors = []
    extractor = archive.get_extractor(location, kinds)
    if DEBUG:
        logger.debug(
            'extract_file: extractor: for: %(location)r with kinds: %(kinds)r : ' % locals()
            + getattr(extractor, '__module__', '')
            + '.' + getattr(extractor, '__name__', ''))
    if extractor:
        yield ExtractEvent(location, target, done=False, warnings=[], errors=[])
        try:
            # extract first to a temp directory.
            # if there is an error, the extracted files will not be moved
            # to target
            tmp_tgt = fileutils.get_temp_dir('extract')
            abs_location = abspath(expanduser(location))
            warnings.extend(extractor(abs_location, tmp_tgt))
            fileutils.copytree(tmp_tgt, target)
            fileutils.delete(tmp_tgt)
        except Exception as e:
            if DEBUG:
                logger.debug(
                    'extract_file: ERROR: %(location)r: %(errors)r, %(e)r.\n' % locals())
            errors = [str(e).strip(' \'"')]
        finally:
            yield ExtractEvent(location, target, done=True, warnings=warnings, errors=errors)
def download_url(url, file_name=None, verify=True):
    """
    Return the temporary location of the file fetched at the remote `url`.
    Use `file_name` if provided or create a file name based on the last url
    segment. If `verify` is True, SSL certificate verification is performed.
    Otherwise, no verification is done but a warning will be printed.
    """
    requests_args = dict(timeout=10, verify=verify)
    file_name = file_name or fileutils.file_name(url)
    try:
        response = requests.get(url, **requests_args)
    except (ConnectionError, InvalidSchema) as e:
        logger.error('fetch: Download failed for %(url)r' % locals())
        raise
    status = response.status_code
    if status != 200:
        msg = 'fetch: Download failed for %(url)r with %(status)r' % locals()
        logger.error(msg)
        raise Exception(msg)
    tmp_dir = fileutils.get_temp_dir(base_dir='fetch')
    output_file = os.path.join(tmp_dir, file_name)
    with open(output_file, 'wb') as out:
        out.write(response.content)
    return output_file
def download_url(url, file_name=None, verify=True, timeout=10):
    """
    Fetch `url` and return the temporary location where the fetched content
    was saved. Use `file_name` if provided or create a new `file_name` based
    on the last url segment. If `verify` is True, SSL certificate
    verification is performed. Otherwise, no verification is done but a
    warning will be printed. `timeout` is the timeout in seconds.
    """
    requests_args = dict(timeout=timeout, verify=verify)
    file_name = file_name or fileutils.file_name(url)
    try:
        response = requests.get(url, **requests_args)
    except (ConnectionError, InvalidSchema) as e:
        logger.error('download_url: Download failed for %(url)r' % locals())
        raise
    status = response.status_code
    if status != 200:
        msg = 'download_url: Download failed for %(url)r with %(status)r' % locals()
        logger.error(msg)
        raise Exception(msg)
    tmp_dir = fileutils.get_temp_dir(prefix='fetch-')
    output_file = os.path.join(tmp_dir, file_name)
    with open(output_file, 'wb') as out:
        out.write(response.content)
    return output_file
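
# A minimal usage sketch for download_url with a hypothetical URL; the
# returned location points inside a fresh temporary directory:
saved_to = download_url('https://example.com/archive.tar.gz', timeout=30)
print(saved_to)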
def uncompress_file(location, decompressor):
    """
    Uncompress a compressed file at `location` and return the temporary
    location of the uncompressed file and a list of warning messages. Raise
    Exceptions on errors. Use the `decompressor` object for decompression.
    """
    # FIXME: do not create a sub-directory and instead strip the "compression"
    # extension such gz, etc. or introspect the archive header to get the file
    # name when present.
    assert location
    assert decompressor
    warnings = []
    base_name = fileutils.file_base_name(location)
    target_location = os.path.join(
        fileutils.get_temp_dir(base_dir='extract'), base_name)
    with decompressor(location, 'rb') as compressed:
        with open(target_location, 'wb') as uncompressed:
            buffer_size = 32 * 1024 * 1024
            while True:
                chunk = compressed.read(buffer_size)
                if not chunk:
                    break
                uncompressed.write(chunk)
    if getattr(decompressor, 'has_trailing_garbage', False):
        warnings.append(location + ': Trailing garbage found and ignored.')
    return target_location, warnings
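
# uncompress_file only needs `decompressor` to be a callable accepting
# (location, 'rb') and returning a context manager over a read()-able
# stream; the standard library's gzip.open and bz2.open both fit. A sketch
# with a hypothetical input path:
import gzip

target_location, warnings = uncompress_file('/tmp/samples/data.gz', gzip.open)
print(target_location, warnings)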
def test_is_dir(self):
    test_dir = self.get_test_loc('symlink', copy=True)
    temp_dir = fileutils.get_temp_dir()
    test_link = join(temp_dir, 'test-dir-link')
    os.symlink(test_dir, test_link)
    assert filetype.is_dir(test_link, follow_symlinks=True)
    assert not filetype.is_dir(test_link, follow_symlinks=False)
def execute(cmd, args, root_dir=None, cwd=None, env=None, to_files=False):
    """
    Run a `cmd` external command with the `args` arguments list and return
    the return code, the stdout and stderr.

    To avoid RAM exhaustion, always write stdout and stderr streams to files.

    If `to_files` is False, return the content of stderr and stdout as ASCII
    strings. Otherwise, return the locations to the stderr and stdout
    temporary files.

    Resolve the `cmd` location using os/arch local/vendored location based on
    `root_dir`. No resolution is done if `root_dir` is None.

    Run the command using the `cwd` current working directory with an `env`
    dict of environment variables.
    """
    assert cmd
    cmd_loc, bin_dir, lib_dir = get_locations(cmd, root_dir)
    full_cmd = [cmd_loc or cmd] + args or []
    env = get_env(env, lib_dir) or None
    cwd = cwd or curr_dir

    # temp files for stderr and stdout
    tmp_dir = fileutils.get_temp_dir(base_dir='cmd')
    sop = os.path.join(tmp_dir, 'stdout')
    sep = os.path.join(tmp_dir, 'stderr')

    # shell==True is DANGEROUS but we are not running arbitrary commands
    # though we can execute commands that just happen to be in the path
    shell = True if on_windows else False

    logger.debug(
        'Executing command %(cmd)r as %(full_cmd)r with: env=%(env)r, '
        'shell=%(shell)r, cwd=%(cwd)r, stdout=%(sop)r, stderr=%(sep)r.'
        % locals())

    proc = None
    try:
        with open(sop, 'wb') as stdout, open(sep, 'wb') as stderr:
            # -1 defaults bufsize to system bufsize
            pargs = dict(cwd=cwd, env=env, stdout=stdout, stderr=stderr,
                         shell=shell, bufsize=-1, universal_newlines=True)
            proc = subprocess.Popen(full_cmd, **pargs)
            stdout, stderr = proc.communicate()
            rc = proc.returncode if proc else 0
    finally:
        close(proc)

    if not to_files:
        # return output as ASCII string loaded from the output files
        sop = text.toascii(open(sop, 'rb').read().strip())
        sep = text.toascii(open(sep, 'rb').read().strip())
    return rc, sop, sep
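
# A minimal usage sketch, assuming no `root_dir` resolution is needed and
# the command is found on the PATH (POSIX-only example):
rc, out, err = execute('ls', ['-l'], to_files=False)
print(rc)
print(out)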
def get_scans_cache_class(cache_dir=scans_cache_dir):
    """
    Return a new persistent cache class configured with a unique storage
    directory.
    """
    # create a unique temp directory in cache_dir
    fileutils.create_dir(cache_dir)
    # ensure that the cache dir is always unicode
    cache_dir = fileutils.get_temp_dir(
        unicode(cache_dir), prefix=unicode(timeutils.time2tstamp()) + u'-')
    sc = ScanFileCache(cache_dir)
    sc.setup()
    return partial(ScanFileCache, cache_dir)
def get_temp_dir(self, sub_dir_path=None):
    """
    Create a unique new temporary directory location. Create directories
    identified by `sub_dir_path` if provided in this temporary directory.
    Return the location for this unique directory joined with the
    `sub_dir_path` if any.
    """
    # ensure that we have a new unique temp directory for each test run
    global test_run_temp_dir
    if not test_run_temp_dir:
        test_run_temp_dir = fileutils.get_temp_dir(base_dir='tst', prefix=' ')
    new_temp_dir = fileutils.get_temp_dir(base_dir=test_run_temp_dir)
    if sub_dir_path:
        # create a sub directory hierarchy if requested
        sub_dir_path = to_os_native_path(sub_dir_path)
        new_temp_dir = os.path.join(new_temp_dir, sub_dir_path)
        fileutils.create_dir(new_temp_dir)
    return new_temp_dir
def get_scans_cache_class(cache_dir=scans_cache_dir):
    """
    Return a new persistent cache class configured with a unique storage
    directory.
    """
    # create a unique temp directory in cache_dir
    fileutils.create_dir(cache_dir)
    prefix = timeutils.time2tstamp() + u'-'
    cache_dir = fileutils.get_temp_dir(cache_dir, prefix=prefix)
    if on_linux:
        cache_dir = path_to_bytes(cache_dir)
    sc = ScanFileCache(cache_dir)
    sc.setup()
    return partial(ScanFileCache, cache_dir)
def convert_to_text(location, _retrying=False):
    """
    Convert the markup file at `location` to plain text. Return the location
    of the converted plain text file or None.
    """
    if not is_markup(location):
        return
    temp_file = os.path.join(fileutils.get_temp_dir('markup'), 'text')
    from bs4 import BeautifulSoup
    with open(location, 'rb') as input_text:
        soup = BeautifulSoup(input_text.read(), 'html5lib')
    with codecs.open(temp_file, mode='wb', encoding='utf-8') as output_text:
        output_text.write(soup.get_text())
    return temp_file
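
# A standalone sketch of the markup-to-text step, using the real
# BeautifulSoup API with the html5lib parser (both named above); the HTML
# string is a made-up sample:
from bs4 import BeautifulSoup

html = b'<html><body><h1>Title</h1><p>Some <b>bold</b> text.</p></body></html>'
soup = BeautifulSoup(html, 'html5lib')
print(soup.get_text())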
def extract_twice(location, target_dir, extractor1, extractor2):
    """
    Extract a nested compressed archive at `location` to `target_dir` using
    the `extractor1` function to a temporary directory then the `extractor2`
    function on the extracted payload of `extractor1`.

    Return a list of warning messages. Raise exceptions on errors.

    Typical nested archives include compressed tarballs and RPMs (containing
    a compressed cpio).

    Note: it would be easy to support deeper extractor chains, but this gets
    hard to trace and debug very quickly. A depth of two is simple and sane
    and covers most common cases.
    """
    if on_linux and py2:
        location = fileutils.fsencode(location)
        target_dir = fileutils.fsencode(target_dir)
    abs_location = os.path.abspath(os.path.expanduser(location))
    abs_target_dir = compat.unicode(
        os.path.abspath(os.path.expanduser(target_dir)))
    # extract first the intermediate payload to a temp dir
    temp_target = compat.unicode(
        fileutils.get_temp_dir(prefix='extractcode-extract-'))
    warnings = extractor1(abs_location, temp_target)
    if TRACE:
        logger.debug('extract_twice: temp_target: %(temp_target)r' % locals())

    # extract this intermediate payload to the final target_dir
    try:
        inner_archives = list(
            fileutils.resource_iter(temp_target, with_dirs=False))
        if not inner_archives:
            warnings.append(location + ': No files found in archive.')
        else:
            for extracted1_loc in inner_archives:
                if TRACE:
                    logger.debug('extract_twice: extractor2: %(extracted1_loc)r' % locals())
                warnings.extend(extractor2(extracted1_loc, abs_target_dir))
    finally:
        # cleanup the temporary output from extractor1
        fileutils.delete(temp_target)
    return warnings
def extract_twice(location, target_dir, extractor1, extractor2):
    """
    Extract a nested compressed archive at `location` to `target_dir` using
    the `extractor1` function to a temporary directory then the `extractor2`
    function on the extracted payload of `extractor1`.

    Return a list of warning messages. Raise exceptions on errors.

    Typical nested archives include compressed tarballs and RPMs (containing
    a compressed cpio).

    Note: it would be easy to support deeper extractor chains, but this gets
    hard to trace and debug very quickly. A depth of two is simple and sane
    and covers most common cases.
    """
    if on_linux:
        location = path_to_bytes(location)
        target_dir = path_to_bytes(target_dir)
    abs_location = os.path.abspath(os.path.expanduser(location))
    abs_target_dir = unicode(os.path.abspath(os.path.expanduser(target_dir)))
    # extract first the intermediate payload to a temp dir
    temp_target = unicode(fileutils.get_temp_dir('extract'))
    warnings = extractor1(abs_location, temp_target)
    if TRACE:
        logger.debug('extract_twice: temp_target: %(temp_target)r' % locals())

    # extract this intermediate payload to the final target_dir
    try:
        inner_archives = list(fileutils.file_iter(temp_target))
        if not inner_archives:
            warnings.append(location + ': No files found in archive.')
        else:
            for extracted1_loc in inner_archives:
                if TRACE:
                    logger.debug('extract_twice: extractor2: %(extracted1_loc)r' % locals())
                warnings.extend(extractor2(extracted1_loc, abs_target_dir))
    finally:
        # cleanup the temporary output from extractor1
        fileutils.delete(temp_target)
    return warnings
def convert_to_utf8(location):
    """
    Convert the file at `location` to UTF-8 text. Return the location of the
    converted file, or the original `location` if the file is not text or if
    no encoding could be detected.
    """
    if not contenttype.get_type(location).is_text:
        return location
    start = open(location, 'rb').read(4096)
    encoding = chardet.detect(start)
    if encoding:
        encoding = encoding.get('encoding', None)
        if encoding:
            target = os.path.join(fileutils.get_temp_dir('markup'),
                                  fileutils.file_name(location))
            with codecs.open(location, 'rb', encoding=encoding,
                             errors='replace', buffering=16384) as inf:
                with codecs.open(target, 'wb', encoding='utf-8') as outf:
                    outf.write(inf.read())
            return target
        else:
            # chardet somehow failed to detect an encoding
            return location
def try_to_extract(location, target_dir, extractor):
    """
    Extract archive at `location` to `target_dir` trying the `extractor`
    function. If extraction fails, return the warnings collected so far
    without raising exceptions.

    Note: there are a few cases where we want to attempt extracting something
    but do not care if this fails.
    """
    abs_location = os.path.abspath(os.path.expanduser(location))
    abs_target_dir = unicode(os.path.abspath(os.path.expanduser(target_dir)))
    temp_target = unicode(fileutils.get_temp_dir('extract1'))
    warnings = []
    try:
        warnings = extractor(abs_location, temp_target)
        if TRACE:
            logger.debug('try_to_extract: temp_target: %(temp_target)r' % locals())
        fileutils.copytree(temp_target, abs_target_dir)
    except Exception:
        return warnings
    finally:
        fileutils.delete(temp_target)
    return warnings
def _collect_and_parse_tags(self):
    ctags_args = ['--fields=K', '--c-kinds=fp', '-f', '-', self.sourcefile]
    ctags_temp_dir = fileutils.get_temp_dir(base_dir='ctags')
    envt = {'TMPDIR': ctags_temp_dir}
    try:
        rc, stdo, err = command.execute2(
            cmd_loc=self.cmd_loc,
            args=ctags_args,
            env=envt,
            lib_dir=self.lib_loc,
            to_files=True)
        if rc != 0:
            raise Exception(open(err).read())
        with open(stdo, 'rb') as lines:
            for line in lines:
                if 'cannot open temporary file' in line:
                    raise Exception('ctags: cannot open temporary file '
                                    ': Permission denied')
                if line.startswith('!'):
                    continue
                line = line.strip()
                if not line:
                    continue
                splitted = line.split('\t')
                if (line.endswith('function\tfile:')
                        or line.endswith('prototype\tfile:')):
                    self.local_functions.append(splitted[0])
                elif (line.endswith('function')
                        or line.endswith('prototype')):
                    self.global_functions.append(splitted[0])
    finally:
        fileutils.delete(ctags_temp_dir)
def execute(cmd_loc, args, cwd=None, env=None, to_files=False, log=TRACE):
    """
    Run a `cmd_loc` command with the `args` arguments list and return the
    return code, the stdout and stderr.

    To avoid RAM exhaustion, always write stdout and stderr streams to files.

    If `to_files` is False, return the content of stderr and stdout as ASCII
    strings. Otherwise, return the locations to the stderr and stdout
    temporary files.

    Run the command using the `cwd` current working directory with an `env`
    dict of environment variables.
    """
    assert cmd_loc
    full_cmd = [cmd_loc] + (args or [])

    # any shared object should be either in the PATH, the rpath or
    # side-by-side with the executable
    cmd_dir = os.path.dirname(cmd_loc)
    env = get_env(env, lib_dir=cmd_dir) or None
    cwd = cwd or curr_dir

    # temp files for stderr and stdout
    tmp_dir = get_temp_dir(prefix='cmd-')
    sop = path.join(tmp_dir, 'stdout')
    sep = path.join(tmp_dir, 'stderr')

    # shell==True is DANGEROUS but we are not running arbitrary commands
    # though we can execute commands that just happen to be in the path
    # See why we need it on Windows https://bugs.python.org/issue8557
    shell = True if on_windows else False

    if log:
        printer = logger.debug if TRACE else lambda x: print(x)
        printer(
            'Executing command %(cmd_loc)r as:\n%(full_cmd)r\nwith: env=%(env)r\n'
            'shell=%(shell)r\ncwd=%(cwd)r\nstdout=%(sop)r\nstderr=%(sep)r' % locals())

    proc = None
    rc = 100
    try:
        with io.open(sop, 'wb') as stdout, io.open(sep, 'wb') as stderr, pushd(cmd_dir):
            proc = subprocess.Popen(
                full_cmd,
                cwd=cwd,
                env=env,
                stdout=stdout,
                stderr=stderr,
                shell=shell,
                # -1 defaults bufsize to system bufsize
                bufsize=-1,
                universal_newlines=True,
            )
            stdout, stderr = proc.communicate()
            rc = proc.returncode if proc else 0
    finally:
        close(proc)

    if not to_files:
        # return output as ASCII string loaded from the output files
        with open(sop, 'rb') as so:
            sor = so.read()
            sop = text.toascii(sor).strip()
        with open(sep, 'rb') as se:
            ser = se.read()
            sep = text.toascii(ser).strip()
    return rc, sop, sep
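
# A usage sketch for the to_files=True mode, where stdout and stderr come
# back as temp-file paths instead of strings; the command location is
# hypothetical:
rc, out_path, err_path = execute('/usr/bin/file', ['--version'], to_files=True)
with open(out_path) as out:
    print(rc, out.read())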
def rebuild_rootfs(image, target_dir, layerid_len=DEFAULT_ID_LEN):
    """
    Extract and merge all layers to `target_dir`. Extraction is done in
    sequence from bottom (root) to top (latest layer). Return a list of
    extraction errors and a list of whiteouts/deleted files.

    The extraction process consists of these steps:
     - extract the layer in a temp directory
     - move the layer to the target directory, overwriting existing files
     - if any, remove AUFS special files/dirs in the target directory
     - if any, remove whiteouts file/directory pairs in the target directory
    """
    from extractcode.extract import extract_file

    assert filetype.is_dir(target_dir)
    assert os.path.exists(target_dir)
    extract_errors = []
    # log whiteouts deletions
    whiteouts = []

    for layer_id, layer in image.layers.items():
        layer_tarball = join(image.repo_dir, layer_id[:layerid_len], LAYER_TAR_FILE)
        logger.debug('Extracting layer tarball: %(layer_tarball)r' % locals())
        temp_target = fileutils.get_temp_dir('conan-docker')
        xevents = list(extract_file(layer_tarball, temp_target))
        for x in xevents:
            if x.warnings or x.errors:
                extract_errors.extend(xevents)

        # FIXME: the order of ops is WRONG: we are getting whiteouts incorrectly
        # it should be:
        # 1. extract a layer to temp.
        # 2. find whiteouts in that layer.
        # 3. remove whiteouts in the previous layer stack (e.g. the WIP rootfs)
        # 4. finally copy the extracted layer over the WIP rootfs

        # move extracted layer to target_dir
        logger.debug(
            'Moving extracted layer from: %(temp_target)r to: %(target_dir)r' % locals())
        fileutils.copytree(temp_target, target_dir)
        fileutils.delete(temp_target)

        logger.debug('Merging extracted layers and applying AUFS whiteouts/deletes')
        for top, dirs, files in fileutils.walk(target_dir):
            # delete AUFS dirs and apply whiteout deletions
            for dr in dirs[:]:
                whiteable_dir = join(top, dr)
                if dr.startswith(WHITEOUT_PREFIX):
                    # delete the .wh. dir...
                    dirs.remove(dr)
                    logger.debug('Deleting whiteout dir: %(whiteable_dir)r' % locals())
                    fileutils.delete(whiteable_dir)
                    # ... and delete the corresponding dir it does "whiteout"
                    base_dir = dr[len(WHITEOUT_PREFIX):]
                    try:
                        dirs.remove(base_dir)
                    except ValueError:
                        # FIXME: should we really raise an exception here?
                        msg = ('Inconsistent layers: '
                               'missing directory to whiteout: %(base_dir)r' % locals())
                        raise InconsistentLayersError(msg)
                    wdo = join(top, base_dir)
                    logger.debug('Deleting real dir: %(wdo)r' % locals())
                    fileutils.delete(wdo)
                    whiteouts.append(wdo)
                # delete AUFS special dirs
                elif dr.startswith(WHITEOUT_SPECIAL_DIR_PREFIX):
                    dirs.remove(dr)
                    logger.debug('Deleting AUFS special dir: %(whiteable_dir)r' % locals())
                    fileutils.delete(whiteable_dir)

            # delete AUFS files and apply whiteout deletions
            all_files = set(files)
            for fl in all_files:
                whiteable_file = join(top, fl)
                if fl.startswith(WHITEOUT_PREFIX):
                    # delete the .wh. marker file...
                    logger.debug('Deleting whiteout file: %(whiteable_file)r' % locals())
                    fileutils.delete(whiteable_file)
                    # ... and delete the corresponding file it does "whiteout"
                    # e.g. logically delete
                    base_file = fl[len(WHITEOUT_PREFIX):]
                    wfo = join(top, base_file)
                    whiteouts.append(wfo)
                    if base_file in all_files:
                        logger.debug('Deleting real file: %(wfo)r' % locals())
                        fileutils.delete(wfo)
                # delete AUFS special files
                elif fl.startswith(WHITEOUT_SPECIAL_DIR_PREFIX):
                    logger.debug('Deleting AUFS special file: %(whiteable_file)r' % locals())
                    fileutils.delete(whiteable_file)
                    whiteouts.append(whiteable_file)

    return extract_errors, whiteouts
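
# The AUFS whiteout convention applied above: a marker named '.wh.<name>' in
# an upper layer means '<name>' from a lower layer must be deleted. A tiny
# standalone illustration of the prefix arithmetic, assuming
# WHITEOUT_PREFIX = '.wh.' as in AUFS:
WHITEOUT_PREFIX = '.wh.'
marker = '.wh.config.json'
if marker.startswith(WHITEOUT_PREFIX):
    base_file = marker[len(WHITEOUT_PREFIX):]
    print(base_file)  # 'config.json': the path to logically delete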
def execute2(cmd_loc, args, lib_dir=None, cwd=None, env=None, to_files=False, log=TRACE):
    """
    Run a `cmd_loc` command with the `args` arguments list and return the
    return code, the stdout and stderr.

    To avoid RAM exhaustion, always write stdout and stderr streams to files.

    If `to_files` is False, return the content of stderr and stdout as ASCII
    strings. Otherwise, return the locations to the stderr and stdout
    temporary files.

    Run the command using the `cwd` current working directory with an `env`
    dict of environment variables.
    """
    assert cmd_loc
    full_cmd = [cmd_loc] + (args or [])
    env = get_env(env, lib_dir) or None
    cwd = cwd or curr_dir

    # temp files for stderr and stdout
    tmp_dir = get_temp_dir(prefix='cmd-')
    if on_linux and py2:
        stdout = b'stdout'
        stderr = b'stderr'
    else:
        stdout = 'stdout'
        stderr = 'stderr'
    sop = path.join(tmp_dir, stdout)
    sep = path.join(tmp_dir, stderr)

    # shell==True is DANGEROUS but we are not running arbitrary commands
    # though we can execute commands that just happen to be in the path
    shell = True if on_windows else False

    if log:
        printer = logger.debug if TRACE else lambda x: print(x)
        printer(
            'Executing command %(cmd_loc)r as:\n%(full_cmd)r\nwith: env=%(env)r\n'
            'shell=%(shell)r\ncwd=%(cwd)r\nstdout=%(sop)r\nstderr=%(sep)r' % locals())

    proc = None
    rc = 100

    if py2:
        okwargs = dict(mode='wb')
    if py3:
        okwargs = dict(mode='w', encoding='utf-8')

    try:
        with io.open(sop, **okwargs) as stdout, io.open(sep, **okwargs) as stderr:
            with pushd(lib_dir):
                popen_args = dict(
                    cwd=cwd,
                    env=env,
                    stdout=stdout,
                    stderr=stderr,
                    shell=shell,
                    # -1 defaults bufsize to system bufsize
                    bufsize=-1,
                    universal_newlines=True,
                )
                proc = subprocess.Popen(full_cmd, **popen_args)
                stdout, stderr = proc.communicate()
                rc = proc.returncode if proc else 0
    finally:
        close(proc)

    if not to_files:
        # return output as ASCII string loaded from the output files
        sop = text.toascii(open(sop, 'rb').read().strip())
        sep = text.toascii(open(sep, 'rb').read().strip())
    return rc, sop, sep
def extract_file_by_file(location, target_dir, arch_type='*', skip_symlinks=True):
    """
    Extract all files one by one from a 7zip-supported archive file at
    `location` in the `target_dir` directory.
    Return a list of warning messages if any or an empty list.
    Raise exceptions on errors.

    `arch_type` is the archive type used with the 7z `-t` option. It can be
    None.
    """
    abs_location = os.path.abspath(os.path.expanduser(location))
    abs_target_dir = os.path.abspath(os.path.expanduser(target_dir))

    entries, errors_msgs = list_entries(location, arch_type)
    entries = list(entries)

    # Determine if we need a one-by-one approach: technically the approach is
    # to check if we have files that are in the same dir and have the same
    # name when the case is ignored. We take a simpler approach: we check if
    # all paths are unique when we ignore the case: for that we only check
    # that the lengths of two path sets are the same: one set as-is and the
    # other lowercased.
    paths_as_is = set(e.path for e in entries)
    paths_no_case = set(p.lower() for p in paths_as_is)
    need_by_file = len(paths_as_is) != len(paths_no_case)

    if not need_by_file:
        # use regular extract
        return extract_all_files_at_once(
            location=location, target_dir=target_dir, arch_type=arch_type)

    # now we are extracting one file at a time. this is a tad painful because
    # we are dealing with a full command execution each time.
    errors = {}
    warnings = {}
    tmp_dir = fileutils.get_temp_dir(prefix='extractcode-extract-')
    for i, entry in enumerate(entries):
        if not entry.is_file:
            continue

        tmp_extract_dir = os.path.join(tmp_dir, str(i))
        fileutils.create_dir(tmp_extract_dir)

        ex_args = build_7z_extract_command(
            location=location,
            target_dir=tmp_extract_dir,
            single_entry=entry,
            arch_type=arch_type,
        )
        rc, stdout, stderr = command.execute2(**ex_args)

        error = get_7z_errors(stdout, stderr)
        if error or rc != 0:
            error = error or UNKNOWN_ERROR
            if TRACE:
                logger.debug(
                    'extract: failure: {rc}\n'
                    'stderr: {stderr}\nstdout: {stdout}'.format(**locals()))
            errors[entry.path] = error
            continue

        # these are all for a single file path
        warns = get_7z_warnings(stdout) or {}
        wmsg = '\n'.join(warns.values())
        if wmsg:
            if entry.path in warnings:
                warnings[entry.path] += '\n' + wmsg
            else:
                warnings[entry.path] = wmsg

        # finally move that extracted file to its target location,
        # possibly renamed
        source_file_name = fileutils.file_name(entry.path)
        source_file_loc = os.path.join(tmp_extract_dir, source_file_name)
        if not os.path.exists(source_file_loc):
            if entry.path in errors:
                errors[entry.path] += '\nNo file name extracted.'
            else:
                errors[entry.path] = 'No file name extracted.'
            continue

        safe_path = paths.safe_path(entry.path, posix=True)
        target_file_loc = os.path.join(target_dir, safe_path)
        target_file_dir = os.path.dirname(target_file_loc)
        fileutils.create_dir(target_file_dir)

        unique_target_file_loc = extractcode.new_name(target_file_loc, is_dir=False)
        if TRACE:
            logger.debug(
                'extract: unique_target_file_loc: from {} to {}'.format(
                    target_file_loc, unique_target_file_loc))

        if os.path.isfile(source_file_loc):
            fileutils.copyfile(source_file_loc, unique_target_file_loc)
        else:
            fileutils.copytree(source_file_loc, unique_target_file_loc)

    extractcode.remove_backslashes_and_dotdots(abs_target_dir)
    if errors:
        raise ExtractErrorFailedToExtract(errors)
    return convert_warnings_to_list(warnings)
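
# The case-collision check above in isolation: two archive paths that differ
# only by case collapse to a single entry when lowercased, which flags the
# need for one-by-one extraction (made-up paths):
paths_as_is = set(['docs/README', 'docs/readme', 'src/main.c'])
paths_no_case = set(p.lower() for p in paths_as_is)
need_by_file = len(paths_as_is) != len(paths_no_case)
print(need_by_file)  # True: 'docs/README' and 'docs/readme' collide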