def test_extractcode_command_works_with_relative_paths(monkeypatch):
    """
    Check that the extractcode CLI extracts an archive given as a relative
    path and produces the expected files under the `-extract` target dir.
    """
    # The setup is a tad complex because we want to have a relative dir
    # to the base dir where we run tests from, ie the scancode-toolkit/ dir
    # To use relative paths, we use our tmp dir at the root of the code tree
    from os.path import dirname, join, abspath
    from commoncode import fileutils
    import extractcode
    import tempfile
    import shutil
    try:
        scancode_root = dirname(dirname(dirname(__file__)))
        scancode_tmp = join(scancode_root, 'tmp')
        fileutils.create_dir(scancode_tmp)
        scancode_root_abs = abspath(scancode_root)
        # strip the absolute root prefix so the test path is relative to the
        # current working directory (the scancode root)
        test_src_dir = tempfile.mkdtemp(dir=scancode_tmp).replace(scancode_root_abs, '').strip('\\/')
        test_file = test_env.get_test_loc('extract_relative_path/basic.zip')
        shutil.copy(test_file, test_src_dir)
        test_src_file = join(test_src_dir, 'basic.zip')
        test_tgt_dir = join(scancode_root, test_src_file) + extractcode.EXTRACT_SUFFIX
        runner = CliRunner()
        # pretend to run in a terminal so progress output is rendered
        monkeypatch.setattr(click._termui_impl, 'isatty', lambda _: True)
        result = runner.invoke(extract_cli.extractcode, [test_src_file])
        assert result.exit_code == 0
        assert 'Extracting done' in result.output
        assert not 'WARNING' in result.output
        assert not 'ERROR' in result.output
        expected = ['/c/a/a.txt', '/c/b/a.txt', '/c/c/a.txt']
        # compare POSIX-style paths relative to the extraction target dir
        file_result = [as_posixpath(f.replace(test_tgt_dir, '')) for f in fileutils.file_iter(test_tgt_dir)]
        assert sorted(expected) == sorted(file_result)
    finally:
        # always remove the temp source dir created inside the code tree
        fileutils.delete(test_src_dir)
def extract_file(location, target, kinds=extractcode.default_kinds):
    """
    Extract a single archive at `location` in the `target` directory if it is
    of a kind supported in the `kinds` kind tuple.
    """
    warnings = []
    errors = []
    # pick the extractor function matching this archive kind, if any
    extractor = archive.get_extractor(location, kinds)
    if TRACE:
        logger.debug('extract_file: extractor: for: %(location)r with kinds: %(kinds)r : ' % locals() + getattr(extractor, '__module__', '') + '.' + getattr(extractor, '__name__', ''))
    if extractor:
        # signal the start of this extraction to the caller (generator API)
        yield ExtractEvent(location, target, done=False, warnings=[], errors=[])
        try:
            # extract first to a temp directory.
            # if there is an error, the extracted files will not be moved
            # to target
            tmp_tgt = fileutils.get_temp_dir('extract')
            abs_location = abspath(expanduser(location))
            warnings.extend(extractor(abs_location, tmp_tgt))
            fileutils.copytree(tmp_tgt, target)
            fileutils.delete(tmp_tgt)
        except Exception, e:
            if TRACE:
                logger.debug('extract_file: ERROR: %(location)r: %(errors)r, %(e)r.\n' % locals())
            # keep a cleaned-up single error message for the final event
            errors = [str(e).strip(' \'"')]
        finally:
            # NOTE(review): the `finally` suite is missing — this definition is
            # truncated in the current view; do not edit without the full body
def test_extractcode_command_works_with_relative_paths_verbose(monkeypatch):
    """
    Check that verbose extractcode output reports relative (not absolute)
    paths when invoked with a relative input path.
    """
    # The setup is a tad complex because we want to have a relative dir
    # to the base dir where we run tests from, ie the scancode-toolkit/ dir
    # To use relative paths, we use our tmp dir at the root of the code tree
    from os.path import dirname, join, abspath
    from commoncode import fileutils
    import tempfile
    import shutil
    try:
        scancode_root = dirname(dirname(dirname(__file__)))
        scancode_tmp = join(scancode_root, 'tmp')
        fileutils.create_dir(scancode_tmp)
        scancode_root_abs = abspath(scancode_root)
        # strip the absolute root prefix so the test path is relative
        test_src_dir = tempfile.mkdtemp(dir=scancode_tmp).replace(scancode_root_abs, '').strip('\\/')
        test_file = test_env.get_test_loc('extract_relative_path/basic.zip')
        shutil.copy(test_file, test_src_dir)
        test_src_file = join(test_src_dir, 'basic.zip')
        runner = CliRunner()
        # pretend to run in a terminal so progress output is rendered
        monkeypatch.setattr(click._termui_impl, 'isatty', lambda _: True)
        result = runner.invoke(extract_cli.extractcode, ['--verbose', test_src_file])
        assert result.exit_code == 0
        # extract the path from the second line of the output
        # check that the path is relative and not absolute
        lines = result.output.splitlines(False)
        line = lines[1]
        line_path = line.split(':', 1)[-1].strip()
        if on_windows:
            # on Windows an absolute path starts with a drive letter + colon
            drive = test_file[:2]
            assert not line_path.startswith(drive)
        else:
            assert not line_path.startswith('/')
    finally:
        # always remove the temp source dir created inside the code tree
        fileutils.delete(test_src_dir)
def extract_with_fallback(location, target_dir, extractor1, extractor2):
    """
    Extract archive at `location` to `target_dir` trying first `extractor1`
    function. If extract fails, attempt extraction again with the
    `extractor2` function. Return a list of warning messages. Raise
    exceptions on errors.

    Note: there are a few cases where the primary extractor for a type may
    fail and a secondary extractor will succeed.
    """
    abs_location = os.path.abspath(os.path.expanduser(location))
    abs_target_dir = unicode(os.path.abspath(os.path.expanduser(target_dir)))
    # attempt extract first to a temp dir
    temp_target1 = unicode(fileutils.get_temp_dir('extract1'))
    try:
        warnings = extractor1(abs_location, temp_target1)
        if TRACE:
            logger.debug('extract_with_fallback: temp_target1: %(temp_target1)r' % locals())
        fileutils.copytree(temp_target1, abs_target_dir)
    # fix: use `except Exception` rather than a bare `except:` so that
    # SystemExit/KeyboardInterrupt are not swallowed by the fallback path
    except Exception:
        # fix: create the fallback temp dir BEFORE entering the inner try:
        # otherwise, if get_temp_dir raised, the finally clause below would
        # fail with a NameError on the unbound `temp_target2`, masking the
        # real extraction error
        temp_target2 = unicode(fileutils.get_temp_dir('extract2'))
        try:
            warnings = extractor2(abs_location, temp_target2)
            if TRACE:
                logger.debug('extract_with_fallback: temp_target2: %(temp_target2)r' % locals())
            fileutils.copytree(temp_target2, abs_target_dir)
        finally:
            fileutils.delete(temp_target2)
    finally:
        fileutils.delete(temp_target1)
    return warnings
def create_html_app_assets(results, output_file):
    """
    Given an html-app output_file, create the corresponding `_files`
    directory and copy the assets to this directory. The target directory is
    deleted if it exists. Raise HtmlAppAssetCopyWarning if the output_file is
    <stdout> or HtmlAppAssetCopyError if the copy was not possible.
    """
    try:
        # writing assets next to <stdout> makes no sense: warn and bail out
        if is_stdout(output_file):
            raise HtmlAppAssetCopyWarning()
        assets_dir = join(get_template_dir('html-app'), 'assets')
        # delete old assets
        tgt_dirs = get_html_app_files_dirs(output_file)
        target_dir = join(*tgt_dirs)
        if exists(target_dir):
            fileutils.delete(target_dir)
        # copy assets
        fileutils.copytree(assets_dir, target_dir)
        # write json data
        import json
        root_path, assets_dir = get_html_app_files_dirs(output_file)
        with open(join(root_path, assets_dir, 'data.json'), 'w') as f:
            f.write('data=' + json.dumps(results))
        # create help file
        with open(join(root_path, assets_dir, 'help.html'), 'w') as f:
            f.write(get_html_app_help(basename(output_file.name)))
    except HtmlAppAssetCopyWarning, w:
        # let the warning propagate unchanged to the caller.
        # NOTE(review): a clause converting other exceptions into
        # HtmlAppAssetCopyError may be missing from this view — confirm
        raise w
def test_tree_checksum_is_different_when_file_is_removed(self):
    """The tree checksum must change once a file is deleted from the tree."""
    tree_dir = self.get_test_loc('cache/tree', copy=True)
    extra_file = os.path.join(tree_dir, 'some.py')
    with open(extra_file, 'wb') as out:
        out.write(' ')
    checksum_with_extra = cache.tree_checksum(tree_dir)
    fileutils.delete(extra_file)
    checksum_without_extra = cache.tree_checksum(tree_dir)
    assert checksum_with_extra != checksum_without_extra
def uncompress(location, target_dir, decompressor, suffix=EXTRACT_SUFFIX):
    """
    Uncompress a compressed file at location in the target_dir using the
    `decompressor` object. The uncompressed file is named after the original
    archive with a `suffix` added. Return a list of warning messages. Raise
    Exceptions on errors.
    """
    # FIXME: do not create a sub-directory and instead strip the "compression"
    # extension such gz, etc. or introspect the archive header to get the file
    # name when present.
    if DEBUG:
        logger.debug('uncompress: ' + location)
    # decompress to a temporary location first
    temp_location, warnings = uncompress_file(location, decompressor)
    # the final name is the original archive name plus the suffix
    final_location = os.path.join(target_dir, os.path.basename(location) + suffix)
    # clear any leftover from a previous run before moving in place
    if os.path.exists(final_location):
        fileutils.delete(final_location)
    os.rename(temp_location, final_location)
    return warnings
def test_delete_unwritable_directory_and_files(self):
    """fileutils.delete must remove a tree even when permissions forbid it."""
    base_dir = self.get_test_loc('fileutils/readwrite', copy=True)
    locked_dir = join(base_dir, 'sub')
    locked_file = join(locked_dir, 'file')
    try:
        # note: there are no unread/writable dir on windows
        # strip all permissions from the file first, then from its parent dir
        for lock_down in (make_non_readable, make_non_executable, make_non_writable):
            lock_down(locked_file)
        for lock_down in (make_non_readable, make_non_executable, make_non_writable):
            lock_down(locked_dir)
        fileutils.delete(locked_dir)
        assert not os.path.exists(locked_dir)
    finally:
        # restore permissions so the temp test dir can be cleaned up
        fileutils.chmod(base_dir, fileutils.RW, recurse=True)
def create_html_app_assets(output_file):
    """
    Given an html-app output_file, create the corresponding `_files`
    directory and copy the assets to this directory. The target directory is
    deleted if it exists. Raise HtmlAppAssetCopyWarning if the output_file is
    <stdout> or HtmlAppAssetCopyError if the copy was not possible.
    """
    try:
        source_assets = join(get_template_dir('html-app'), 'assets')
        destination_dirs = get_html_app_files_dirs(output_file)
        if not destination_dirs:
            # nowhere to write assets to (output goes to <stdout>)
            raise HtmlAppAssetCopyWarning()
        destination = join(*destination_dirs)
        # replace any stale assets left over from a previous run
        if exists(destination):
            fileutils.delete(destination)
        fileutils.copytree(source_assets, destination)
    except HtmlAppAssetCopyWarning as w:
        raise w
def create_html_app_assets(output_file):
    """
    Given an html-app output_file, create the corresponding `_files`
    directory and copy the assets to this directory. The target directory is
    deleted if it exists. Raise HtmlAppAssetCopyWarning if the output_file is
    <stdout> or HtmlAppAssetCopyError if the copy was not possible.
    """
    try:
        assets_dir = join(get_template_dir("html-app"), "assets")
        tgt_dirs = get_html_app_files_dirs(output_file)
        if not tgt_dirs:
            # nowhere to copy assets to (output goes to <stdout>)
            raise HtmlAppAssetCopyWarning()
        target_dir = join(*tgt_dirs)
        # delete old assets from a previous run, then copy fresh ones
        if exists(target_dir):
            fileutils.delete(target_dir)
        fileutils.copytree(assets_dir, target_dir)
    except HtmlAppAssetCopyWarning, w:
        # let the warning propagate unchanged to the caller
        raise w
def extract_twice(location, target_dir, extractor1, extractor2):
    """
    Extract a nested compressed archive at `location` to `target_dir` using
    the `extractor1` function to a temporary directory then the `extractor2`
    function on the extracted payload of `extractor1`. Return a list of
    warning messages. Raise exceptions on errors.

    Typical nested archives include compressed tarballs and RPMs (containing
    a compressed cpio).

    Note: it would be easy to support deeper extractor chains, but this gets
    hard to trace and debug very quickly. A depth of two is simple and sane
    and covers most common cases.
    """
    if on_linux:
        # on Linux the extractors work on byte paths
        location = path_to_bytes(location)
        target_dir = path_to_bytes(target_dir)
    abs_location = os.path.abspath(os.path.expanduser(location))
    abs_target_dir = unicode(os.path.abspath(os.path.expanduser(target_dir)))
    # extract first the intermediate payload to a temp dir
    temp_target = unicode(fileutils.get_temp_dir('extract'))
    warnings = extractor1(abs_location, temp_target)
    if TRACE:
        logger.debug('extract_twice: temp_target: %(temp_target)r' % locals())
    # extract this intermediate payload to the final target_dir
    try:
        inner_archives = list(fileutils.file_iter(temp_target))
        if not inner_archives:
            warnings.append(location + ': No files found in archive.')
        else:
            # each file extracted by extractor1 is itself an archive to
            # extract with extractor2 into the final target
            for extracted1_loc in inner_archives:
                if TRACE:
                    logger.debug('extract_twice: extractor2: %(extracted1_loc)r' % locals())
                warnings.extend(extractor2(extracted1_loc, abs_target_dir))
    finally:
        # cleanup the temporary output from extractor1
        fileutils.delete(temp_target)
    return warnings
def get_gem_metadata(location):
    """
    Return the string content of the metadata of a .gem archive file at
    `location` or None
    """
    extract_loc = None
    try:
        # Extract first level of tar archive
        extract_loc = fileutils.get_temp_dir(prefix='scancode-extract-')
        abs_location = abspath(expanduser(location))
        warnings = archive.extract_tar(abs_location, extract_loc) or []
        if warnings:
            raise Exception('Failed to extract RubyGem .gem file.\n' + '\n'.join(warnings))
        # The gzipped metadata is the second level of archive.
        metadata = os.path.join(extract_loc, 'metadata')
        # or it can be a plain, non-gzipped file
        metadata_gz = metadata + '.gz'
        if os.path.exists(metadata):
            # plain file: read it directly
            with open(metadata, 'rb') as met:
                content = met.read()
        elif os.path.exists(metadata_gz):
            # gzipped file: decompress it in memory
            content, warnings = get_gz_compressed_file_content(metadata_gz)
            if warnings:
                raise Exception('Failed to extract RubyGem .gem/metadata.gz file.\n' + '\n'.join(warnings))
        else:
            raise Exception('No gem metadata found in RubyGem .gem file.')
        return content
    finally:
        # always remove the temporary extraction dir, if it was created
        if extract_loc:
            fileutils.delete(extract_loc)
def try_to_extract(location, target_dir, extractor):
    """
    Extract archive at `location` to `target_dir` trying the `extractor`
    function. If extract fails, just return without returning warnings nor
    raising exceptions. Note: there are a few cases where we want to attempt
    extracting something but do not care if this fails.
    """
    abs_location = os.path.abspath(os.path.expanduser(location))
    abs_target_dir = unicode(os.path.abspath(os.path.expanduser(target_dir)))
    temp_target = unicode(fileutils.get_temp_dir('extract1'))
    warnings = []
    try:
        warnings = extractor(abs_location, temp_target)
        if TRACE:
            logger.debug('try_to_extract: temp_target: %(temp_target)r' % locals())
        fileutils.copytree(temp_target, abs_target_dir)
    # fix: swallow only real extraction errors via `except Exception`, not a
    # bare `except:` that would also eat SystemExit/KeyboardInterrupt.
    # Silencing errors here is deliberate: this is a best-effort extraction.
    except Exception:
        return warnings
    finally:
        fileutils.delete(temp_target)
    return warnings
def _collect_and_parse_tags(self):
    """
    Run ctags on `self.sourcefile` and parse its output, collecting
    file-local function names in `self.local_functions` and externally
    visible ones in `self.global_functions`.
    """
    ctags_args = ['--fields=K', '--c-kinds=fp', '-f', '-', self.sourcefile]
    ctags_temp_dir = fileutils.get_temp_dir(base_dir='ctags')
    # point ctags temp files to our own temp dir
    envt = {'TMPDIR': ctags_temp_dir}
    try:
        # fix: the original passed the positional `ctags_args` AFTER the
        # keyword argument `cmd_loc=...`, which is a SyntaxError in Python;
        # pass it as the `args` keyword instead
        rc, stdo, err = command.execute2(
            cmd_loc=self.cmd_loc,
            args=ctags_args,
            env=envt,
            lib_dir=self.lib_loc,
            to_files=True,
        )
        if rc != 0:
            raise Exception(open(err).read())
        with open(stdo, 'rb') as lines:
            for line in lines:
                if 'cannot open temporary file' in line:
                    raise Exception('ctags: cannot open temporary file ' ': Permission denied')
                # skip ctags pseudo-tag header lines
                if line.startswith('!'):
                    continue
                line = line.strip()
                if not line:
                    continue
                splitted = line.split('\t')
                # "file:"-scoped tags are definitions local to the file
                if (line.endswith('function\tfile:')
                        or line.endswith('prototype\tfile:')):
                    self.local_functions.append(splitted[0])
                elif (line.endswith('function')
                        or line.endswith('prototype')):
                    self.global_functions.append(splitted[0])
    finally:
        fileutils.delete(ctags_temp_dir)
def extract_with_fallback(location, target_dir, extractor1, extractor2):
    """
    Extract archive at `location` to `target_dir` trying first the primary
    `extractor1` function. If extract fails with this function, attempt
    extraction again with the fallback `extractor2` function.
    Return a list of warning messages. Raise exceptions on errors.

    Note: there are a few cases where the primary extractor for a type may
    fail and a fallback extractor will succeed.
    """
    abs_location = os.path.abspath(os.path.expanduser(location))
    abs_target_dir = compat.unicode(
        os.path.abspath(os.path.expanduser(target_dir)))
    # attempt extract first to a temp dir
    temp_target1 = compat.unicode(
        fileutils.get_temp_dir(prefix='extractcode-extract1-'))
    try:
        warnings = extractor1(abs_location, temp_target1)
        if TRACE:
            logger.debug(
                'extract_with_fallback: temp_target1: %(temp_target1)r' % locals())
        fileutils.copytree(temp_target1, abs_target_dir)
    # fix: use `except Exception` rather than a bare `except:` so that
    # SystemExit/KeyboardInterrupt are not swallowed by the fallback path
    except Exception:
        # fix: create the fallback temp dir BEFORE entering the inner try:
        # otherwise, if get_temp_dir raised, the finally clause below would
        # fail with a NameError on the unbound `temp_target2`, masking the
        # real extraction error
        temp_target2 = compat.unicode(
            fileutils.get_temp_dir(prefix='extractcode-extract2-'))
        try:
            warnings = extractor2(abs_location, temp_target2)
            if TRACE:
                logger.debug(
                    'extract_with_fallback: temp_target2: %(temp_target2)r' % locals())
            fileutils.copytree(temp_target2, abs_target_dir)
        finally:
            fileutils.delete(temp_target2)
    finally:
        fileutils.delete(temp_target1)
    return warnings
def create_html_app_assets(results, output_file):
    """
    Given an html-app output_file, create the corresponding `_files`
    directory and copy the assets to this directory. The target directory is
    deleted if it exists. Raise HtmlAppAssetCopyWarning if the output_file is
    <stdout> or HtmlAppAssetCopyError if the copy was not possible.
    """
    try:
        # writing assets next to <stdout> makes no sense: warn and bail out
        if is_stdout(output_file):
            raise HtmlAppAssetCopyWarning()
        assets_dir = os.path.join(get_template_dir('html-app'), 'assets')
        # delete old assets
        tgt_dirs = get_html_app_files_dirs(output_file)
        target_dir = os.path.join(*tgt_dirs)
        if os.path.exists(target_dir):
            fileutils.delete(target_dir)
        # copy assets
        fileutils.copytree(assets_dir, target_dir)
        # write json data
        root_path, assets_dir = get_html_app_files_dirs(output_file)
        with codecs.open(os.path.join(root_path, assets_dir, 'data.json'), 'wb', encoding='utf-8') as f:
            f.write('data=')
            json.dump(results, f, iterable_as_array=True)
        # create help file
        with codecs.open(os.path.join(root_path, assets_dir, 'help.html'), 'wb', encoding='utf-8') as f:
            f.write(get_html_app_help(os.path.basename(output_file.name)))
    except HtmlAppAssetCopyWarning, w:
        # let the warning propagate unchanged to the caller
        raise w
def test_extractcode_command_works_with_relative_paths(monkeypatch):
    """
    Check that the extractcode CLI extracts an archive given as a relative
    path and produces the expected files under the `-extract` target dir.
    """
    # The setup is a tad complex because we want to have a relative dir
    # to the base dir where we run tests from, ie the scancode-toolkit/ dir
    # To use relative paths, we use our tmp dir at the root of the code tree
    from os.path import dirname, join, abspath
    from commoncode import fileutils
    import extractcode
    import tempfile
    import shutil
    try:
        scancode_root = dirname(dirname(dirname(__file__)))
        scancode_tmp = join(scancode_root, 'tmp')
        fileutils.create_dir(scancode_tmp)
        scancode_root_abs = abspath(scancode_root)
        # strip the absolute root prefix so the test path is relative
        test_src_dir = tempfile.mkdtemp(dir=scancode_tmp).replace(
            scancode_root_abs, '').strip('\\/')
        test_file = test_env.get_test_loc('extract_relative_path/basic.zip')
        shutil.copy(test_file, test_src_dir)
        test_src_file = join(test_src_dir, 'basic.zip')
        test_tgt_dir = join(scancode_root, test_src_file) + extractcode.EXTRACT_SUFFIX
        runner = CliRunner()
        # pretend to run in a terminal so progress output is rendered
        monkeypatch.setattr(click._termui_impl, 'isatty', lambda _: True)
        result = runner.invoke(extract_cli.extractcode, [test_src_file])
        assert result.exit_code == 0
        assert 'Extracting done' in result.output
        assert not 'WARNING' in result.output
        assert not 'ERROR' in result.output
        expected = ['/c/a/a.txt', '/c/b/a.txt', '/c/c/a.txt']
        # compare POSIX-style paths relative to the extraction target dir
        file_result = [
            as_posixpath(f.replace(test_tgt_dir, ''))
            for f in fileutils.resource_iter(test_tgt_dir, with_dirs=False)
        ]
        assert sorted(expected) == sorted(file_result)
    finally:
        # always remove the temp source dir created inside the code tree
        fileutils.delete(test_src_dir)
def test_extractcode_command_works_with_relative_paths_verbose(monkeypatch):
    """
    Check that verbose extractcode output reports relative (not absolute)
    paths when invoked with a relative input path.
    """
    # The setup is a tad complex because we want to have a relative dir
    # to the base dir where we run tests from, ie the scancode-toolkit/ dir
    # To use relative paths, we use our tmp dir at the root of the code tree
    from os.path import dirname, join, abspath
    from commoncode import fileutils
    import tempfile
    import shutil
    try:
        scancode_root = dirname(dirname(dirname(__file__)))
        scancode_tmp = join(scancode_root, 'tmp')
        fileutils.create_dir(scancode_tmp)
        scancode_root_abs = abspath(scancode_root)
        # strip the absolute root prefix so the test path is relative
        test_src_dir = tempfile.mkdtemp(dir=scancode_tmp).replace(
            scancode_root_abs, '').strip('\\/')
        test_file = test_env.get_test_loc('extract_relative_path/basic.zip')
        shutil.copy(test_file, test_src_dir)
        test_src_file = join(test_src_dir, 'basic.zip')
        runner = CliRunner()
        # pretend to run in a terminal so progress output is rendered
        monkeypatch.setattr(click._termui_impl, 'isatty', lambda _: True)
        result = runner.invoke(extract_cli.extractcode, ['--verbose', test_src_file])
        assert result.exit_code == 0
        # extract the path from the second line of the output
        # check that the path is relative and not absolute
        lines = result.output.splitlines(False)
        line = lines[1]
        line_path = line.split(':', 1)[-1].strip()
        if on_windows:
            # on Windows an absolute path starts with a drive letter + colon
            drive = test_file[:2]
            assert not line_path.startswith(drive)
        else:
            assert not line_path.startswith('/')
    finally:
        # always remove the temp source dir created inside the code tree
        fileutils.delete(test_src_dir)
def cli(license_dir, source, trace, clean, match_text=False, match_approx=False):
    """
    Synchronize ScanCode licenses with an external license source.

    DIR is the directory to store (or load) external licenses.

    When using the dejacode source your need to set the 'DEJACODE_API_URL' and
    'DEJACODE_API_KEY' environment variables with your credentials.
    """
    # docstring above doubles as the CLI help text: kept verbatim
    global TRACE
    TRACE = trace
    if clean:
        # wipe the license dir and its companion work dirs
        fileutils.delete(license_dir)
        stripped = license_dir.rstrip('/\\')
        for tag in ('-new', '-update', '-del'):
            fileutils.delete(stripped + tag)
    source_cls = SOURCES[source]
    license_source = source_cls(license_dir, match_text, match_approx)
    synchronize_licenses(license_source)
    print()
def clear(self, *args):
    """
    Purge the cache by deleting the corresponding cached data files.

    Extra positional `args` are accepted and ignored — presumably so this
    method can be used directly as a callback/signal handler; confirm with
    callers.
    """
    fileutils.delete(self.cache_base_dir)
def create_html_app(output_file, results, version, scanned_path):  # NOQA
    """
    Given an html-app output_file, generate that file, create the data.js
    data file from the results and create the corresponding `_files`
    directory and copy the data and assets to this directory. The target
    directory is deleted if it exists. Raise HtmlAppAssetCopyWarning if the
    output_file is <stdout> or HtmlAppAssetCopyError if the copy was not
    possible.
    """
    try:
        # writing assets next to <stdout> makes no sense: warn and bail out
        if is_stdout(output_file):
            raise HtmlAppAssetCopyWarning()
        source_assets_dir = join(TEMPLATES_DIR, 'html-app', 'assets')
        # the assets dir is named after the output file base name (stripped
        # from its extension) with a `_files` suffix, as a sibling of the
        # output file
        output_location = output_file.name
        tgt_root_path = dirname(output_location)
        tgt_assets_dir = file_base_name(output_location) + '_files'
        # delete old assets
        target_assets_dir = join(tgt_root_path, tgt_assets_dir)
        if exists(target_assets_dir):
            delete(target_assets_dir)
        # copy assets
        copytree(source_assets_dir, target_assets_dir)
        template = get_template(join(TEMPLATES_DIR, 'html-app', 'template.html'))
        rendered_html = template.render(
            assets_dir=target_assets_dir,
            scanned_path=scanned_path,
            version=version
        )
        output_file.write(rendered_html)
        # create help file
        help_template = get_template(join(TEMPLATES_DIR, 'html-app', 'help_template.html'))
        rendered_help = help_template.render(main_app=output_location)
        with io.open(join(target_assets_dir, 'help.html'), 'w', encoding='utf-8') as f:
            f.write(rendered_help)
        # write json data
        # FIXME: this should a regular JSON scan format
        # bytes mode/prefix on py2, text mode/prefix on py3
        if py2:
            mode = 'wb'
            prefix = b'data='
        if py3:
            mode = 'w'
            prefix = u'data='
        with io.open(join(target_assets_dir, 'data.js'), mode) as f:
            f.write(prefix)
            simplejson.dump(results, f, iterable_as_array=True)
    except HtmlAppAssetCopyWarning as w:
        # let the warning propagate unchanged to the caller
        raise w
    except Exception as e:  # NOQA
        # wrap any other failure, keeping the traceback in the message
        import traceback
        msg = 'ERROR: cannot create HTML application.\n' + traceback.format_exc()
        raise HtmlAppAssetCopyError(msg)
def rebuild_rootfs(img, target_dir):
    """
    Extract and merge or "squash" all layers of the `image` Image in a single
    rootfs in `target_dir`. Extraction is done in sequence from the bottom
    (root or initial) layer to the top (or latest) layer and the "whiteouts"
    unionfs/overlayfs procedure is applied at each step as per the OCI spec:
    https://github.com/opencontainers/image-spec/blob/master/layer.md#whiteouts

    Return a list of deleted "whiteout" files.
    Raise an Exception on errrors.

    The extraction process consists of these steps:
    - extract the layer in a temp directory
    - find whiteouts in that layer temp dir
    - remove files/directories corresponding to these whiteouts in the target directory
    - remove whiteouts special marker files or dirs in the tempdirectory
    - move layer to the target directory, overwriting existing files

    See also some related implementations and links:
    https://github.com/moby/moby/blob/d1f470946/pkg/archive/whiteouts.go
    https://github.com/virt-manager/virt-bootstrap/blob/8a7e752d/src/virtBootstrap/whiteout.py
    https://github.com/goldmann/docker-squash
    https://github.com/moby/moby/blob/master/image/spec/v1.md
    https://github.com/moby/moby/blob/master/image/spec/v1.1.md
    https://github.com/moby/moby/blob/master/image/spec/v1.2.md
    """
    assert os.path.isdir(target_dir)
    # log deletions
    deletions = []
    for layer_num, layer in enumerate(img.layers):
        if TRACE:
            logger.debug(f'Extracting layer {layer_num} - {layer.layer_id} '
                         f'tarball: {layer.archive_location}')
        # 1. extract a layer to temp.
        # Note that we are not preserving any special file and any file permission
        extracted_loc = tempfile.mkdtemp('container_inspector-docker')
        layer.extract(extracted_location=extracted_loc)
        if TRACE:
            logger.debug(f' Extracted layer to: {extracted_loc}')
        # 2. find whiteouts in that layer.
        whiteouts = list(find_whiteouts(extracted_loc))
        if TRACE:
            logger.debug(' Merging extracted layers and applying unionfs whiteouts')
        if TRACE:
            logger.debug(' Whiteouts:\n' + ' \n'.join(map(repr, whiteouts)))
        # 3. remove whiteouts in the previous layer stack (e.g. the WIP rootfs)
        for whiteout_marker_loc, whiteable_path in whiteouts:
            if TRACE:
                logger.debug(f' Deleting dir or file with whiteout marker: {whiteout_marker_loc}')
            whiteable_loc = os.path.join(target_dir, whiteable_path)
            delete(whiteable_loc)
            # also delete the whiteout marker file
            delete(whiteout_marker_loc)
            deletions.append(whiteable_loc)
        # 4. finall copy/overwrite the extracted layer over the WIP rootfs
        if TRACE:
            logger.debug(f' Moving extracted layer from: {extracted_loc} to: {target_dir}')
        copytree(extracted_loc, target_dir)
        if TRACE:
            logger.debug(f' Moved layer to: {target_dir}')
        # clean up this layer's temp extraction dir before the next layer
        delete(extracted_loc)
    return deletions
def test_get_or_build_index_through_cache(self):
    # note: this is a rather complex test because caching involves some globals
    license_index_cache_dir = self.get_temp_dir('index_cache')
    _index_lock_file = os.path.join(license_index_cache_dir, 'lockfile')
    _tree_checksum_file = os.path.join(license_index_cache_dir, 'tree_checksums')
    _index_cache_file = os.path.join(license_index_cache_dir, 'index_cache')
    _tree_base_dir = self.get_temp_dir('src_dir')
    _licenses_dir = self.get_test_loc('cache/data', copy=True)
    _licenses_data_dir = os.path.join(_licenses_dir, 'licenses')
    _rules_data_dir = os.path.join(_licenses_dir, 'rules')
    _timeout = 10
    # sanity: no cache artifacts exist before the first build
    assert not os.path.exists(_tree_checksum_file)
    assert not os.path.exists(_index_cache_file)
    assert not os.path.exists(_index_lock_file)
    check_consistency = True
    return_index = False
    # when a new index is built, new index files are created
    cache.get_or_build_index_through_cache(
        check_consistency, return_index, _tree_base_dir, _tree_checksum_file,
        _index_lock_file, _index_cache_file, _licenses_data_dir,
        _rules_data_dir, _timeout)
    assert os.path.exists(_tree_checksum_file)
    assert os.path.exists(_index_cache_file)
    assert not os.path.exists(_index_lock_file)
    # when nothing changed a new index files is not created
    tree_before = open(_tree_checksum_file).read()
    idx_checksum_before = hash.sha1(_index_cache_file)
    idx_date_before = date.get_file_mtime(_index_cache_file)
    cache.get_or_build_index_through_cache(
        check_consistency, return_index, _tree_base_dir, _tree_checksum_file,
        _index_lock_file, _index_cache_file, _licenses_data_dir,
        _rules_data_dir, _timeout)
    assert tree_before == open(_tree_checksum_file).read()
    assert idx_checksum_before == hash.sha1(_index_cache_file)
    assert idx_date_before == date.get_file_mtime(_index_cache_file)
    # now add some file in the source tree
    new_file = os.path.join(_tree_base_dir, 'some file')
    with open(new_file, 'wb') as nf:
        nf.write('somthing')
    # when check_consistency is False, the index is not rebuild when
    # new files are added
    check_consistency = False
    cache.get_or_build_index_through_cache(
        check_consistency, return_index, _tree_base_dir, _tree_checksum_file,
        _index_lock_file, _index_cache_file, _licenses_data_dir,
        _rules_data_dir, _timeout)
    assert tree_before == open(_tree_checksum_file).read()
    assert idx_checksum_before == hash.sha1(_index_cache_file)
    assert idx_date_before == date.get_file_mtime(_index_cache_file)
    # when check_consistency is True, the index is rebuilt when new
    # files are added
    check_consistency = True
    cache.get_or_build_index_through_cache(
        check_consistency, return_index, _tree_base_dir, _tree_checksum_file,
        _index_lock_file, _index_cache_file, _licenses_data_dir,
        _rules_data_dir, _timeout)
    assert tree_before != open(_tree_checksum_file).read()
    assert idx_date_before != date.get_file_mtime(_index_cache_file)
    # now add some ignored file in the source tree
    tree_before = open(_tree_checksum_file).read()
    idx_checksum_before = hash.sha1(_index_cache_file)
    idx_date_before = date.get_file_mtime(_index_cache_file)
    new_file = os.path.join(_tree_base_dir, 'some file.pyc')
    with open(new_file, 'wb') as nf:
        nf.write('somthing')
    # .pyc files are ignored, so even a consistency check must not rebuild
    check_consistency = True
    cache.get_or_build_index_through_cache(
        check_consistency, return_index, _tree_base_dir, _tree_checksum_file,
        _index_lock_file, _index_cache_file, _licenses_data_dir,
        _rules_data_dir, _timeout)
    assert tree_before == open(_tree_checksum_file).read()
    assert idx_checksum_before == hash.sha1(_index_cache_file)
    assert idx_date_before == date.get_file_mtime(_index_cache_file)
    # if the treechecksum file dies the index is rebuilt
    fileutils.delete(_tree_checksum_file)
    idx_checksum_before = hash.sha1(_index_cache_file)
    check_consistency = False
    cache.get_or_build_index_through_cache(
        check_consistency, return_index, _tree_base_dir, _tree_checksum_file,
        _index_lock_file, _index_cache_file, _licenses_data_dir,
        _rules_data_dir, _timeout)
    assert tree_before == open(_tree_checksum_file).read()
    assert idx_date_before != date.get_file_mtime(_index_cache_file)
    # if the index cache file dies the index is rebuilt
    fileutils.delete(_index_cache_file)
    check_consistency = False
    cache.get_or_build_index_through_cache(
        check_consistency, return_index, _tree_base_dir, _tree_checksum_file,
        _index_lock_file, _index_cache_file, _licenses_data_dir,
        _rules_data_dir, _timeout)
    assert tree_before == open(_tree_checksum_file).read()
    assert os.path.exists(_index_cache_file)
def test_build_index(self):
    # note: this is a rather complex test because caching involves some globals
    cache_dir = self.get_temp_dir('index_cache')
    lock_file, checksum_file, cache_file = get_license_cache_paths(cache_dir=cache_dir)
    tree_base_dir = self.get_temp_dir('src_dir')
    licenses_data_dir = self.get_test_loc('cache/data/licenses', copy=True)
    rules_data_dir = self.get_test_loc('cache/data/rules', copy=True)
    # now add some file in the mock source tree
    new_file = os.path.join(tree_base_dir, 'some.py')
    with open(new_file, 'wb') as nf:
        nf.write('somthing')
    timeout = 10
    # sanity: no cache artifacts exist before the first build
    assert not os.path.exists(checksum_file)
    assert not os.path.exists(cache_file)
    assert not os.path.exists(lock_file)
    # when a new index is built, new index files are created
    check_consistency = True
    cache.get_cached_index(cache_dir, check_consistency, timeout,
                           tree_base_dir, licenses_data_dir, rules_data_dir)
    assert os.path.exists(checksum_file)
    assert os.path.exists(cache_file)
    assert not os.path.exists(lock_file)
    # when nothing changed a new index files is not created
    tree_before = open(checksum_file).read()
    idx_checksum_before = hash.sha1(cache_file)
    cache.get_cached_index(cache_dir, check_consistency, timeout,
                           tree_base_dir, licenses_data_dir, rules_data_dir)
    assert tree_before == open(checksum_file).read()
    assert idx_checksum_before == hash.sha1(cache_file)
    # now add some file in the source tree
    new_file = os.path.join(tree_base_dir, 'some file')
    with open(new_file, 'wb') as nf:
        nf.write('somthing')
    # when check_consistency is False, the index is not rebuild when
    # new files are added
    check_consistency = False
    cache.get_cached_index(cache_dir, check_consistency, timeout,
                           tree_base_dir, licenses_data_dir, rules_data_dir)
    assert tree_before == open(checksum_file).read()
    assert idx_checksum_before == hash.sha1(cache_file)
    # when check_consistency is True, the index is rebuilt when new
    # files are added
    check_consistency = True
    cache.get_cached_index(cache_dir, check_consistency, timeout,
                           tree_base_dir, licenses_data_dir, rules_data_dir)
    assert tree_before != open(checksum_file).read()
    # now add some ignored file in the source tree
    tree_before = open(checksum_file).read()
    idx_checksum_before = hash.sha1(cache_file)
    new_file = os.path.join(tree_base_dir, 'some file.pyc')
    with open(new_file, 'wb') as nf:
        nf.write('somthing')
    # when check_consistency is True, the index is not rebuilt when new
    # files are added that are ignored
    check_consistency = True
    cache.get_cached_index(cache_dir, check_consistency, timeout,
                           tree_base_dir, licenses_data_dir, rules_data_dir)
    assert tree_before == open(checksum_file).read()
    assert idx_checksum_before == hash.sha1(cache_file)
    # if the treechecksum file dies, the index is rebuilt
    fileutils.delete(checksum_file)
    idx_checksum_before = hash.sha1(cache_file)
    check_consistency = False
    cache.get_cached_index(cache_dir, check_consistency, timeout,
                           tree_base_dir, licenses_data_dir, rules_data_dir)
    assert tree_before == open(checksum_file).read()
    # if the index cache file dies the index is rebuilt
    fileutils.delete(cache_file)
    check_consistency = False
    idx1 = cache.get_cached_index(cache_dir, check_consistency, timeout,
                                  tree_base_dir, licenses_data_dir, rules_data_dir)
    # load index, forced from file
    idx2 = cache.load_index(cache_file)
    assert idx1.to_dict(True) == idx2.to_dict(True)
    # reset global caches
    cache._LICENSE_SYMBOLS_BY_SPDX_KEY = {}
    cache._LICENSES_BY_KEY_INDEX = None
    cache._UNKNOWN_SPDX_SYMBOL = None
    cache._LICENSES_BY_KEY = None
def rebuild_rootfs(image, target_dir, layerid_len=DEFAULT_ID_LEN):
    """
    Extract and merge all layers of `image` to `target_dir`. Extraction is
    done in sequence from bottom (root) to top (latest layer).

    Return a two-tuple of (extract_errors, whiteouts) where extract_errors
    is a list of extraction events that carry warnings or errors, and
    whiteouts is a list of paths of files/dirs deleted by whiteout markers.

    The extraction process consists of these steps:
     - extract the layer in a temp directory
     - move layer to the target directory, overwriting existing files
     - if any, remove AUFS special files/dirs in the target directory
     - if any, remove whiteouts file/directory pairs in the target directory
    """
    from extractcode.extract import extract_file

    assert filetype.is_dir(target_dir)
    assert os.path.exists(target_dir)
    extract_errors = []
    # log whiteouts deletions
    whiteouts = []

    for layer_id, layer in image.layers.items():
        layer_tarball = join(image.repo_dir, layer_id[:layerid_len], LAYER_TAR_FILE)
        logger.debug('Extracting layer tarball: %(layer_tarball)r' % locals())
        temp_target = fileutils.get_temp_dir('conan-docker')
        xevents = list(extract_file(layer_tarball, temp_target))
        # BUGFIX: the original looped over xevents and called
        # extract_errors.extend(xevents) once PER event with warnings/errors,
        # duplicating the whole event list; collect the events only once.
        if any(x.warnings or x.errors for x in xevents):
            extract_errors.extend(xevents)

        # FIXME: the order of ops is WRONG: we are getting whiteouts incorrectly
        # it should be:
        # 1. extract a layer to temp.
        # 2. find whiteouts in that layer.
        # 3. remove whiteouts in the previous layer stack (e.g. the WIP rootfs)
        # 4. finall copy the extracted layer over the WIP rootfs

        # move extracted layer to target_dir
        # BUGFIX: the original message lacked the `% locals()` interpolation,
        # logging the raw placeholders instead of the actual paths.
        logger.debug(
            'Moving extracted layer from: %(temp_target)r to: %(target_dir)r' % locals())
        fileutils.copytree(temp_target, target_dir)
        fileutils.delete(temp_target)

        logger.debug(
            'Merging extracted layers and applying AUFS whiteouts/deletes')
        for top, dirs, files in fileutils.walk(target_dir):
            # delete AUFS dirs and apply whiteout deletions
            # iterate a copy since we mutate `dirs` in place to prune the walk
            for dr in dirs[:]:
                whiteable_dir = join(top, dr)
                if dr.startswith(WHITEOUT_PREFIX):
                    # delete the .wh. dir...
                    dirs.remove(dr)
                    logger.debug('Deleting whiteout dir: %(whiteable_dir)r' % locals())
                    fileutils.delete(whiteable_dir)

                    # ... and delete the corresponding dir it does "whiteout"
                    base_dir = dr[len(WHITEOUT_PREFIX):]
                    try:
                        dirs.remove(base_dir)
                    except ValueError:
                        # FIXME: should we really raise an exception here?
                        msg = ('Inconsistent layers: '
                               'missing directory to whiteout: %(base_dir)r' % locals())
                        raise InconsistentLayersError(msg)
                    wdo = join(top, base_dir)
                    logger.debug('Deleting real dir: %(wdo)r' % locals())
                    fileutils.delete(wdo)
                    whiteouts.append(wdo)

                # delete AUFS special dirs
                elif dr.startswith(WHITEOUT_SPECIAL_DIR_PREFIX):
                    dirs.remove(dr)
                    logger.debug(
                        'Deleting AUFS special dir: %(whiteable_dir)r' % locals())
                    fileutils.delete(whiteable_dir)

            # delete AUFS files and apply whiteout deletions
            all_files = set(files)
            for fl in all_files:
                whiteable_file = join(top, fl)
                if fl.startswith(WHITEOUT_PREFIX):
                    # delete the .wh. marker file...
                    logger.debug('Deleting whiteout file: %(whiteable_file)r' % locals())
                    fileutils.delete(whiteable_file)
                    # ... and delete the corresponding file it does "whiteout"
                    # e.g. logically delete
                    base_file = fl[len(WHITEOUT_PREFIX):]
                    wfo = join(top, base_file)
                    whiteouts.append(wfo)
                    if base_file in all_files:
                        logger.debug('Deleting real file: %(wfo)r' % locals())
                        fileutils.delete(wfo)

                # delete AUFS special files
                elif fl.startswith(WHITEOUT_SPECIAL_DIR_PREFIX):
                    logger.debug(
                        'Deleting AUFS special file: %(whiteable_file)r' % locals())
                    fileutils.delete(whiteable_file)
                    whiteouts.append(whiteable_file)

    return extract_errors, whiteouts
def test_LicenseCache_load_or_build(self):
    """
    Exercise LicenseCache.load_or_build end to end: a first call creates the
    checksum and cache files; later calls rebuild only when check_consistency
    is True and the source tree changed (ignored *.pyc files do not count) or
    when the cache file itself is missing.
    """
    # recreate internal paths for testing
    licensedcode_cache_dir = self.get_temp_dir('index_cache')
    scancode_cache_dir = self.get_temp_dir('index_metafiles')
    idx_cache_dir = os.path.join(licensedcode_cache_dir, cache.LICENSE_INDEX_DIR)
    fileutils.create_dir(idx_cache_dir)
    cache_file = os.path.join(idx_cache_dir, cache.LICENSE_INDEX_FILENAME)
    lock_file = os.path.join(scancode_cache_dir, cache.LICENSE_LOCKFILE_NAME)
    checksum_file = os.path.join(scancode_cache_dir, cache.LICENSE_CHECKSUM_FILE)

    tree_base_dir = self.get_temp_dir('src_dir')
    licenses_data_dir = self.get_test_loc('cache/data/licenses', copy=True)
    rules_data_dir = self.get_test_loc('cache/data/rules', copy=True)

    # now add some file in the mock source tree
    new_file = os.path.join(tree_base_dir, 'some.py')
    with open(new_file, 'w') as nf:
        nf.write('somthing')

    # nothing exists before the first build
    assert not os.path.exists(checksum_file)
    assert not os.path.exists(cache_file)
    assert not os.path.exists(lock_file)

    timeout = 10

    # when a new cache is built, new cache files are created
    check_consistency = True
    _cached1 = cache.LicenseCache.load_or_build(
        licensedcode_cache_dir=licensedcode_cache_dir,
        scancode_cache_dir=scancode_cache_dir,
        check_consistency=check_consistency,
        timeout=timeout,
        tree_base_dir=tree_base_dir,
        licenses_data_dir=licenses_data_dir,
        rules_data_dir=rules_data_dir,
    )
    assert os.path.exists(checksum_file)
    assert os.path.exists(cache_file)

    # when nothing changed a new index files is not created
    tree_before = open(checksum_file).read()
    idx_checksum_before = hash.sha1(cache_file)
    _cached2 = cache.LicenseCache.load_or_build(
        licensedcode_cache_dir=licensedcode_cache_dir,
        scancode_cache_dir=scancode_cache_dir,
        check_consistency=check_consistency,
        timeout=timeout,
        tree_base_dir=tree_base_dir,
        licenses_data_dir=licenses_data_dir,
        rules_data_dir=rules_data_dir,
    )
    assert open(checksum_file).read() == tree_before
    assert hash.sha1(cache_file) == idx_checksum_before

    # now add some file in the source tree
    new_file = os.path.join(tree_base_dir, 'some file')
    with open(new_file, 'w') as nf:
        nf.write('somthing')

    # when check_consistency is False, the index is not rebuild when
    # new files are added
    check_consistency = False
    _cached3 = cache.LicenseCache.load_or_build(
        licensedcode_cache_dir=licensedcode_cache_dir,
        scancode_cache_dir=scancode_cache_dir,
        check_consistency=check_consistency,
        timeout=timeout,
        tree_base_dir=tree_base_dir,
        licenses_data_dir=licenses_data_dir,
        rules_data_dir=rules_data_dir,
    )
    assert open(checksum_file).read() == tree_before
    assert hash.sha1(cache_file) == idx_checksum_before

    # when check_consistency is True, the index is rebuilt when new
    # files are added
    check_consistency = True
    _cached4 = cache.LicenseCache.load_or_build(
        licensedcode_cache_dir=licensedcode_cache_dir,
        scancode_cache_dir=scancode_cache_dir,
        check_consistency=check_consistency,
        timeout=timeout,
        tree_base_dir=tree_base_dir,
        licenses_data_dir=licenses_data_dir,
        rules_data_dir=rules_data_dir,
    )
    assert open(checksum_file).read() != tree_before

    # now add some ignored file in the source tree
    tree_before = open(checksum_file).read()
    idx_checksum_before = hash.sha1(cache_file)
    new_file = os.path.join(tree_base_dir, 'some file.pyc')
    with open(new_file, 'w') as nf:
        nf.write('somthing')

    # when check_consistency is True, the index is not rebuilt when new
    # files are added that are ignored
    check_consistency = True
    _cached5 = cache.LicenseCache.load_or_build(
        licensedcode_cache_dir=licensedcode_cache_dir,
        scancode_cache_dir=scancode_cache_dir,
        check_consistency=check_consistency,
        timeout=timeout,
        tree_base_dir=tree_base_dir,
        licenses_data_dir=licenses_data_dir,
        rules_data_dir=rules_data_dir,
    )
    assert open(checksum_file).read() == tree_before
    assert hash.sha1(cache_file) == idx_checksum_before

    # if the treechecksum file dies, the index is not rebuilt if
    # check_consistency is False. and no new checksum is created
    fileutils.delete(checksum_file)
    idx_checksum_before = hash.sha1(cache_file)
    check_consistency = False
    _cached6 = cache.LicenseCache.load_or_build(
        licensedcode_cache_dir=licensedcode_cache_dir,
        scancode_cache_dir=scancode_cache_dir,
        check_consistency=check_consistency,
        timeout=timeout,
        tree_base_dir=tree_base_dir,
        licenses_data_dir=licenses_data_dir,
        rules_data_dir=rules_data_dir,
    )
    assert not os.path.exists(checksum_file)

    # with the treechecksum file gone, the index is rebuilt if
    # check_consistency is True and a new checksum is created
    idx_checksum_before = hash.sha1(cache_file)
    check_consistency = True
    _cached7 = cache.LicenseCache.load_or_build(
        licensedcode_cache_dir=licensedcode_cache_dir,
        scancode_cache_dir=scancode_cache_dir,
        check_consistency=check_consistency,
        timeout=timeout,
        tree_base_dir=tree_base_dir,
        licenses_data_dir=licenses_data_dir,
        rules_data_dir=rules_data_dir,
    )
    assert open(checksum_file).read() == tree_before

    # if the index cache file dies the index is rebuilt
    fileutils.delete(cache_file)
    check_consistency = False
    cached8 = cache.LicenseCache.load_or_build(
        licensedcode_cache_dir=licensedcode_cache_dir,
        scancode_cache_dir=scancode_cache_dir,
        check_consistency=check_consistency,
        timeout=timeout,
        tree_base_dir=tree_base_dir,
        licenses_data_dir=licenses_data_dir,
        rules_data_dir=rules_data_dir,
    )
    idx1 = cached8.index

    # load index, forced from file
    cached9 = cache.load_cache_file(cache_file)
    idx2 = cached9.index
    # the rebuilt and the reloaded indexes carry the same token dictionary
    assert set(idx2.dictionary.keys()) == set(idx1.dictionary.keys())
def test_LicenseCache_load_or_build_from_empty(self):
    """
    Check LicenseCache.load_or_build starting from an empty cache directory:
    a cache file is created when none exists, force=True always rebuilds it,
    and force=False reuses an existing cache file untouched.
    """
    # recreate the internal directory layout the cache module expects
    licensedcode_cache_dir = self.get_temp_dir('index_cache')
    scancode_cache_dir = self.get_temp_dir('index_metafiles')
    idx_cache_dir = os.path.join(licensedcode_cache_dir, cache.LICENSE_INDEX_DIR)
    fileutils.create_dir(idx_cache_dir)
    cache_file = os.path.join(idx_cache_dir, cache.LICENSE_INDEX_FILENAME)
    lock_file = os.path.join(scancode_cache_dir, cache.LICENSE_LOCKFILE_NAME)

    licenses_data_dir = self.get_test_loc('cache/data/licenses', copy=True)
    rules_data_dir = self.get_test_loc('cache/data/rules', copy=True)

    # nothing exists yet
    assert not os.path.exists(cache_file)
    assert not os.path.exists(lock_file)

    # every call below uses the same locations and timeout
    build_kwargs = dict(
        licensedcode_cache_dir=licensedcode_cache_dir,
        scancode_cache_dir=scancode_cache_dir,
        timeout=10,
        licenses_data_dir=licenses_data_dir,
        rules_data_dir=rules_data_dir,
    )

    # a first call creates the cache file
    cache.LicenseCache.load_or_build(force=False, **build_kwargs)
    assert os.path.exists(cache_file)

    # force=True also builds an index when none exists
    fileutils.delete(cache_file)
    cache.LicenseCache.load_or_build(force=True, **build_kwargs)
    assert os.path.exists(cache_file)

    # force=True rebuilds an existing index, changing the file
    checksum_before = hash.sha1(cache_file)
    cache.LicenseCache.load_or_build(force=True, **build_kwargs)
    assert hash.sha1(cache_file) != checksum_before

    # force=False loads the existing index, leaving the file untouched
    checksum_before = hash.sha1(cache_file)
    cache.LicenseCache.load_or_build(force=False, **build_kwargs)
    assert hash.sha1(cache_file) == checksum_before