def remove_vcs(self, test_dir):
    """
    Remove version control directories (CVS, .svn, .git, .hg) and editor
    temporary files (names ending in ~) found anywhere under `test_dir`.
    """
    vcses = ('CVS', '.svn', '.git', '.hg')
    # editors temp file leftovers suffix; hoisted out of the walk loop
    tilde = b'~' if on_linux and py2 else '~'
    if on_linux and py2:
        vcses = tuple(fsencode(p) for p in vcses)
        test_dir = fsencode(test_dir)

    for root, dirs, files in os.walk(test_dir):
        for vcs_dir in vcses:
            if vcs_dir not in dirs:
                continue
            vcs_loc = path.join(root, vcs_dir)
            # Make the VCS tree writable first: some checkouts (e.g. git
            # object files) are read-only and rmtree would fail on them.
            # Bug fix: walk only the VCS directory being removed, not the
            # whole test_dir again for every VCS directory found.
            for vcsroot, vcsdirs, vcsfiles in os.walk(vcs_loc):
                for vcsfile in vcsdirs + vcsfiles:
                    vfile = path.join(vcsroot, vcsfile)
                    fileutils.chmod(vfile, fileutils.RW, recurse=False)
            shutil.rmtree(vcs_loc, False)

        # editors temp file leftovers
        tilde_files = [
            path.join(root, file_loc)
            for file_loc in files
            if file_loc.endswith(tilde)
        ]
        for tf in tilde_files:
            os.remove(tf)
def get_test_loc(test_path, test_data_dir, debug=False, exists=True):
    """
    Given a `test_path` relative to the `test_data_dir` directory, return the
    location to a test file or directory for this path. No copy is done.
    """
    if on_linux and py2:
        test_path, test_data_dir = fsencode(test_path), fsencode(test_data_dir)

    if debug:
        import inspect
        caller = inspect.stack()[1][3]
        print('\nget_test_loc,%(caller)s,"%(test_path)s","%(test_data_dir)s"' % locals())

    assert test_path
    assert test_data_dir

    # fail early with a clear error when the test data dir itself is missing
    if not path.exists(test_data_dir):
        msg = ("[Errno 2] No such directory: test_data_dir not found:"
               " '%(test_data_dir)s'")
        raise IOError(msg % locals())

    test_loc = path.abspath(path.join(test_data_dir, to_os_native_path(test_path)))

    if exists and not path.exists(test_loc):
        msg = ("[Errno 2] No such file or directory: "
               "test_path not found: '%(test_loc)s'")
        raise IOError(msg % locals())

    return test_loc
def check_scan_does_not_fail_when_scanning_unicode_files_and_paths(verbosity):
    """
    Run a full scan over a unicode-named test tree and check the JSON results
    against the per-OS expected file, with `verbosity` appended to the
    scan arguments and the expected file name.
    """
    test_dir = test_env.get_test_loc(u'unicodepath/uc')
    result_file = test_env.get_temp_file('json')
    if on_linux:
        test_dir = fsencode(test_dir)
        result_file = fsencode(result_file)

    args = ['--info', '--license', '--copyright', '--package', '--email',
            '--url', '--strip-root', test_dir, '--json', result_file]
    if verbosity:
        args.append(verbosity)
    results = run_scan_click(args)

    # the paths for each OS end up encoded differently.
    # See for details:
    # https://github.com/nexB/scancode-toolkit/issues/390
    # https://github.com/nexB/scancode-toolkit/issues/688
    if on_linux:
        os_tag = 'linux'
    elif on_mac:
        os_tag = 'mac'
    elif on_windows:
        os_tag = 'win'
    expected = 'unicodepath/unicodepath.expected-%s.json' % os_tag + verbosity

    check_json_scan(test_env.get_test_loc(expected), result_file,
                    remove_file_date=True, regen=False)
    return results
def get_test_loc(self, test_path, copy=False, debug=False):
    """
    Given a `test_path` relative to the self.test_data_dir directory, return
    the location to a test file or directory for this path. Copy to a temp
    test location if `copy` is True.
    """
    test_data_dir = self.test_data_dir
    if on_linux and py2:
        test_path = fsencode(test_path)
        test_data_dir = fsencode(test_data_dir)

    if debug:
        import inspect
        caller = inspect.stack()[1][3]
        print('\nself.get_test_loc,%(caller)s,"%(test_path)s"' % locals())

    test_loc = get_test_loc(test_path, test_data_dir, debug=debug)
    if not copy:
        return test_loc

    base_name = path.basename(test_loc)
    if filetype.is_file(test_loc):
        # target must be an existing dir
        target_dir = self.get_temp_dir()
        fileutils.copyfile(test_loc, target_dir)
        return path.join(target_dir, base_name)

    # target must be a NON existing dir
    target_dir = path.join(self.get_temp_dir(), base_name)
    fileutils.copytree(test_loc, target_dir)
    # cleanup of VCS that could be left over from checkouts
    self.remove_vcs(target_dir)
    return target_dir
def test_scan_can_handle_non_utf8_file_names_on_posix():
    """
    Scan a tree with non-UTF8 file names and check the JSON results against
    the per-OS (and per-Python on Windows) expected file.
    """
    test_dir = test_env.extract_test_tar_raw('non_utf8/non_unicode.tgz')
    result_file = test_env.get_temp_file('json')
    if on_linux and py2:
        test_dir = fsencode(test_dir)
        result_file = fsencode(result_file)

    run_scan_click(['-i', '--strip-root', test_dir, '--json', result_file])

    # the paths for each OS end up encoded differently.
    # See for details:
    # https://github.com/nexB/scancode-toolkit/issues/390
    # https://github.com/nexB/scancode-toolkit/issues/688
    if on_linux:
        expected = 'non_utf8/expected-linux.json'
    elif on_mac:
        expected = 'non_utf8/expected-mac.json'
    elif on_windows:
        expected = ('non_utf8/expected-win-py2.json' if py2
                    else 'non_utf8/expected-win-py3.json')

    check_json_scan(test_env.get_test_loc(expected), result_file, regen=False)
def extract_tar(location, target_dir, verbatim=False, *args, **kwargs):
    """
    Extract a tar archive at location in the target_dir directory.
    If `verbatim` is True preserve the permissions.
    """
    # always for using bytes for paths on all OSses... tar seems to use bytes
    # internally and get confused otherwise
    location = fsencode(location)
    if on_linux and py2:
        target_dir = fsencode(target_dir)

    with open(location, 'rb') as input_tar:
        tar = tarfile.open(fileobj=input_tar)
        try:
            # keep only the members we are allowed to extract
            to_extract = [ti for ti in tar.getmembers()
                          if tar_can_extract(ti, verbatim)]
            if not verbatim:
                # normalize permissions unless preserving them verbatim
                for ti in to_extract:
                    ti.mode = 0o755
            tar.extractall(target_dir, members=to_extract)
        finally:
            tar.close()
def test_fsdecode_and_fsencode_are_idempotent(self): a = b'foo\xb1bar' b = u'foo\udcb1bar' assert a == fsencode(fsdecode(a)) assert a == fsencode(fsdecode(b)) assert b == fsdecode(fsencode(a)) assert b == fsdecode(fsencode(b))
def get_handlers(location):
    """
    Return an iterable of (handler, type_matched, mime_matched,
    extension_matched,) for this `location`.

    Yield nothing when `location` is not a file or no handler matches.
    """
    if on_linux and py2:
        location = fileutils.fsencode(location)

    if filetype.is_file(location):
        T = contenttype.get_type(location)
        ftype = T.filetype_file.lower()
        mtype = T.mimetype_file

        if TRACE_DEEP:
            logger.debug('get_handlers: processing %(location)s: ftype: %(ftype)s, mtype: %(mtype)s ' % locals())

        for handler in archive_handlers:
            if not handler.extractors:
                continue

            extractor_count = len(handler.extractors)
            if extractor_count > 2:
                raise Exception('Maximum level of archive nesting is two.')

            # default to False
            type_matched = handler.filetypes and any(t in ftype for t in handler.filetypes)
            mime_matched = handler.mimetypes and any(m in mtype for m in handler.mimetypes)
            # Bug fix: extension_matched was only assigned when the handler
            # had extensions: without this default it raised
            # UnboundLocalError on the first extension-less handler and
            # carried a stale value from a prior handler afterwards.
            extension_matched = False
            exts = handler.extensions
            if exts:
                if on_linux and py2:
                    exts = tuple(fileutils.fsencode(e) for e in exts)
                extension_matched = location.lower().endswith(exts)

            if TRACE_DEEP:
                logger.debug(' get_handlers: matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s' % locals())

            # a strict handler requires all three criteria to match
            if handler.strict and not all([type_matched, mime_matched, extension_matched]):
                logger.debug(' get_handlers: skip strict' % locals())
                continue

            if type_matched or mime_matched or extension_matched:
                if TRACE_DEEP:
                    handler_name = handler.name
                    logger.debug(' get_handlers: yielding handler: %(handler_name)r' % locals())
                yield handler, type_matched, mime_matched, extension_matched
def _extract_tar_raw(test_path, target_dir, to_bytes, *args, **kwargs): """ Raw simplified extract for certain really weird paths and file names. """ if to_bytes and py2: # use bytes for paths on ALL OSes (though this may fail on macOS) target_dir = fsencode(target_dir) test_path = fsencode(test_path) tar = tarfile.open(test_path) tar.extractall(path=target_dir) tar.close()
def extract_zip_raw(location, target_dir, *args, **kwargs):
    """
    Extract a zip archive file at location in the target_dir directory.
    Use the builtin extractall function.

    Raise an Exception if `location` is not an existing zip file.
    """
    # Bug fix: the original test `not isfile(...) and is_zipfile(...)` could
    # never be true for a plain missing/non-zip path, so nothing was ever
    # rejected; validate that the path is a file AND a zip archive.
    if not (path.isfile(location) and zipfile.is_zipfile(location)):
        raise Exception('Incorrect zip file %(location)r' % locals())

    if on_linux and py2:
        location = fsencode(location)
        target_dir = fsencode(target_dir)

    with zipfile.ZipFile(location) as zipf:
        zipf.extractall(path=target_dir)
def update_path_var(existing_path_var, new_path, pathsep=PATH_ENV_SEP):
    """
    Return an updated value for the `existing_path_var` PATH-like environment
    variable value by adding `new_path` to the front of that variable if
    `new_path` is not already part of this PATH-like variable.
    """
    if not new_path:
        # nothing to add: return the existing value unchanged
        return existing_path_var

    existing_path_var = existing_path_var or EMPTY_STRING

    # ensure we use unicode or bytes depending on OSes
    # TODO: deal also with Python versions
    if on_linux and py2:
        # bytes ...
        existing_path_var = fsencode(existing_path_var)
        new_path = fsencode(new_path)
        pathsep = fsencode(pathsep)
    else:
        # ... and unicode otherwise
        existing_path_var = fsdecode(existing_path_var)
        new_path = fsdecode(new_path)
        pathsep = fsdecode(pathsep)

    path_elements = existing_path_var.split(pathsep)

    if not path_elements:
        # NOTE(review): str.split never returns an empty list (it yields ['']
        # for an empty string), so this branch looks unreachable — confirm.
        updated_path_var = new_path

    elif new_path not in path_elements:
        # add new path to the front of the PATH env var
        path_elements.insert(0, new_path)
        updated_path_var = pathsep.join(path_elements)

    else:
        # new path is already in PATH, change nothing
        updated_path_var = existing_path_var

    if py2:
        # always use bytes for env vars...
        if isinstance(updated_path_var, compat.unicode):
            updated_path_var = fsencode(updated_path_var)
    else:
        # ... else use unicode
        if not isinstance(updated_path_var, compat.unicode):
            updated_path_var = fsdecode(updated_path_var)

    # at this stage updated_path_var is unicode on all OSes on Py3
    # and on Py2 it is bytes on Linux and unicode elsewhere
    return updated_path_var
def get_temp_dir(self, sub_dir_path=None):
    """
    Create a unique new temporary directory location. Create directories
    identified by sub_dir_path if provided in this temporary directory.
    Return the location for this unique directory joined with the
    sub_dir_path if any.
    """
    # ensure that we have a new unique temp directory for each test run
    global test_run_temp_dir
    if not test_run_temp_dir:
        from scancode_config import scancode_root_dir
        # now we add a space in the path for testing path with spaces
        test_run_temp_dir = fileutils.get_temp_dir(
            base_dir=path.join(scancode_root_dir, 'tmp'),
            prefix='scancode-tk-tests -')
        if on_linux and py2:
            test_run_temp_dir = fsencode(test_run_temp_dir)

    unique_dir = fileutils.get_temp_dir(base_dir=test_run_temp_dir, prefix='')
    if sub_dir_path:
        # create a sub directory hierarchy if requested
        unique_dir = path.join(unique_dir, to_os_native_path(sub_dir_path))
        fileutils.create_dir(unique_dir)
    return unique_dir
def test_extractcode_command_can_extract_archive_with_unicode_names(monkeypatch):
    """
    Run extractcode on a tree with unicode-named archives and check the
    extracted file paths.
    """
    monkeypatch.setattr(click._termui_impl, 'isatty', lambda _: True)
    test_dir = test_env.get_test_loc('unicodearch', copy=True)
    if on_linux:
        test_dir = fsencode(test_dir)
    runner = CliRunner()
    result = runner.invoke(extract_cli.extractcode, [test_dir])
    assert result.exit_code == 0

    use_bytes = on_linux and py2
    uni_arch = b'unicodepath.tgz' if use_bytes else 'unicodepath.tgz'
    uni_path = b'/unicodepath/' if use_bytes else '/unicodepath/'

    file_result = []
    for f in map(as_posixpath, resource_iter(test_dir, with_dirs=False)):
        if f.endswith(uni_arch):
            continue
        # keep only the part of each path starting at the unicode dir
        tail = EMPTY_STRING.join(f.partition(uni_path)[1:])
        if tail:
            file_result.append(tail)

    expected = [
        '/unicodepath/Ho_',
        '/unicodepath/Ho_a',
        '/unicodepath/koristenjem_Karkkainen_-_Sander.pdf'
    ]
    assert sorted(expected) == sorted(file_result)
def remove_archive_suffix(path):
    """
    Return `path` with all the extracted-archive suffixes removed.
    """
    cleaned = fsencode(path) if on_linux else path
    return re.sub(EXTRACT_SUFFIX, EMPTY_STRING, cleaned)
def remove_backslashes_and_dotdots(directory):
    """
    Walk a directory and rename the files if their names contain backslashes.
    Return a list of errors if any.
    """
    if on_linux:
        directory = fsencode(directory)
    errors = []
    for top, _, files in os.walk(directory):
        for filename in files:
            # only touch names containing backslashes or '..'
            if not (WIN_PATH_SEP in filename or DOTDOT in filename):
                continue
            try:
                cleaned = posixpath.normpath(
                    as_posixpath(filename).strip(POSIX_PATH_SEP))
                cleaned = posixpath.normpath(
                    cleaned.replace(DOTDOT, POSIX_PATH_SEP).strip(POSIX_PATH_SEP))
                segments = cleaned.split(POSIX_PATH_SEP)
                # create the parent hierarchy then move the file into it
                parent_dir = join(top, *segments[:-1])
                create_dir(parent_dir)
                shutil.move(join(top, filename), join(top, *segments))
            except Exception:
                errors.append(join(top, filename))
    return errors
def test_extractcode_command_can_ignore(monkeypatch):
    """
    Run extractcode with --ignore '*.tar' and check that ignored archives are
    not extracted.
    """
    monkeypatch.setattr(click._termui_impl, 'isatty', lambda _: True)
    test_dir = test_env.get_test_loc('extract_ignore', copy=True)
    if on_linux:
        test_dir = fsencode(test_dir)
    runner = CliRunner()
    result = runner.invoke(extract_cli.extractcode, ['--ignore', '*.tar', test_dir])
    assert result.exit_code == 0

    file_result = [
        f for f in map(as_posixpath, resource_iter(test_dir, with_dirs=False))
        # Bug fix: the original used `or`, which is always True for any path
        # (a path cannot end with both suffixes), so the filter excluded
        # nothing. The intent is to exclude both a.tar and b.tar.
        if not f.endswith(('a.tar', 'b.tar'))
    ]
    file_result = [
        EMPTY_STRING.join(f.partition('/a.zip-extract/')[1:]) for f in file_result
    ]
    file_result = [f for f in file_result if f]
    expected = [
        '/a.zip-extract/a.txt',
        '/a.zip-extract/b.zip',
        '/a.zip-extract/b.zip-extract/b.txt',
        '/a.zip-extract/c.tar',
    ]
    assert sorted(expected) == sorted(file_result)
def is_data(location, definitions=DATA_TYPE_DEFINITIONS):
    """
    Return True if the file at `location` is a data file.
    """
    if on_linux and py2:
        location = fileutils.fsencode(location)
    if not filetype.is_file(location):
        return False

    T = get_type(location)
    ftype = T.filetype_file.lower()
    mtype = T.mimetype_file.lower()

    for ddef in definitions:
        type_matched = ddef.filetypes and any(t in ftype for t in ddef.filetypes)
        mime_matched = ddef.mimetypes and any(m in mtype for m in ddef.mimetypes)
        # Bug fix: extension_matched was only assigned when a definition had
        # extensions: without this default it raised UnboundLocalError on the
        # first extension-less definition and carried a stale value from a
        # prior definition afterwards.
        extension_matched = False
        exts = ddef.extensions
        if exts:
            extension_matched = location.lower().endswith(exts)

        if TRACE:
            logger_debug('is_data: considering def: %(ddef)r for %(location)s' % locals())
            logger_debug('matched type: %(type_matched)s, mime: %(mime_matched)s, ext: %(extension_matched)s' % locals())

        # a strict definition requires all three criteria to match
        if ddef.strict and not all([type_matched, mime_matched, extension_matched]):
            continue

        if type_matched or mime_matched or extension_matched:
            if TRACE:
                logger_debug('is_data: True: %(location)s: ' % locals())
            return True

    return False
def get_extraction_path(path):
    """
    Return the path where to extract, derived from `path` by appending the
    extraction suffix.
    """
    base = fsencode(path) if on_linux else path
    return base.rstrip(PATHS_SEPS) + EXTRACT_SUFFIX
def is_extracted(location):
    """
    Return True if the location is already extracted to the corresponding
    extraction location.
    """
    if on_linux:
        location = fsencode(location)
    if not location:
        # preserve falsy returns (empty string, None) as-is
        return location
    return exists(get_extraction_path(location))
def is_extraction_path(path):
    """
    Return True if the path points to an extraction path.
    """
    if on_linux:
        path = fsencode(path)
    if not path:
        # preserve falsy returns (empty string, None) as-is
        return path
    return path.rstrip(PATHS_SEPS).endswith(EXTRACT_SUFFIX)
def update_path_environment(new_path, _os_module=_os_module): """ Update the PATH environment variable by adding `new_path` to the front of PATH if `new_path` is not alreday in the PATH. """ # note: _os_module is used to facilitate mock testing using an # object with a sep string attribute and an environ mapping # attribute if not new_path: return new_path = new_path.strip() if not new_path: return path_env = _os_module.environ.get(b'PATH') if not path_env: # this is quite unlikely to ever happen, but here for safety path_env = '' # ensure we use unicode or bytes depending on OSes if on_linux: new_path = fsencode(new_path) path_env = fsencode(path_env) sep = _os_module.pathsep else: new_path = fsdecode(new_path) path_env = fsdecode(path_env) sep = unicode(_os_module.pathsep) path_segments = path_env.split(sep) # add lib path to the front of the PATH env var # this will use bytes on Linux and unicode elsewhere if new_path not in path_segments: if not path_env: new_path_env = new_path else: new_path_env = sep.join([new_path, path_env]) if not on_linux: # recode to bytes using FS encoding new_path_env = fsencode(new_path_env) # ... and set the variable back as bytes _os_module.environ[b'PATH'] = new_path_env
def extract_twice(location, target_dir, extractor1, extractor2):
    """
    Extract a nested compressed archive at `location` to `target_dir` using
    the `extractor1` function to a temporary directory then the `extractor2`
    function on the extracted payload of `extractor1`.

    Return a list of warning messages. Raise exceptions on errors.

    Typical nested archives include compressed tarballs and RPMs (containing
    a compressed cpio).

    Note: it would be easy to support deeper extractor chains, but this gets
    hard to trace and debug very quickly. A depth of two is simple and sane
    and covers most common cases.
    """
    if on_linux and py2:
        location = fileutils.fsencode(location)
        target_dir = fileutils.fsencode(target_dir)
    abs_location = os.path.abspath(os.path.expanduser(location))
    # the final target is forced to unicode
    abs_target_dir = compat.unicode(
        os.path.abspath(os.path.expanduser(target_dir)))
    # extract first the intermediate payload to a temp dir
    temp_target = compat.unicode(
        fileutils.get_temp_dir(prefix='extractcode-extract-'))
    warnings = extractor1(abs_location, temp_target)
    if TRACE:
        logger.debug('extract_twice: temp_target: %(temp_target)r' % locals())

    # extract this intermediate payload to the final target_dir
    try:
        inner_archives = list(
            fileutils.resource_iter(temp_target, with_dirs=False))
        if not inner_archives:
            warnings.append(location + ': No files found in archive.')
        else:
            for extracted1_loc in inner_archives:
                if TRACE:
                    logger.debug(
                        'extract_twice: extractor2: %(extracted1_loc)r' % locals())
                warnings.extend(extractor2(extracted1_loc, abs_target_dir))
    finally:
        # cleanup the temporary output from extractor1, even on errors
        fileutils.delete(temp_target)
    return warnings
def new_name(location, is_dir=False):
    """
    Return a new non-existing location from a `location` usable to write a
    file or create directory without overwriting existing files or
    directories in the same parent directory, ignoring the case of the
    filename.

    The case of the filename is ignored to ensure that similar results are
    returned across case sensitive (*nix) and case insensitive file systems.

    To find a new unique filename, this tries new names this way:
     * pad a directory name with _X where X is an incremented number.
     * pad a file base name with _X where X is an incremented number and
       keep the extension unchanged.
    """
    assert location
    if on_linux:
        location = fsencode(location)
    location = location.rstrip(PATHS_SEPS)
    assert location

    parent = parent_directory(location)

    # all existing files or directory as lower case
    siblings_lower = set(s.lower() for s in os.listdir(parent))

    filename = file_name(location)

    # corner case
    # Bug fix: the tuple was (DOT, DOT) with a duplicated entry, so a '..'
    # name was never remapped to an underscore.
    if filename in (DOT, DOTDOT):
        filename = UNDERSCORE

    # if unique, return this
    if filename.lower() not in siblings_lower:
        return join(parent, filename)

    # otherwise seek a unique name
    if is_dir:
        # directories do not have an "extension"
        base_name = filename
        ext = EMPTY_STRING
    else:
        base_name, dot, ext = filename.partition(DOT)
        if dot:
            ext = dot + ext
        else:
            base_name = filename
            ext = EMPTY_STRING

    # find a unique filename, adding a counter int to the base_name
    counter = 1
    while 1:
        filename = base_name + UNDERSCORE + str(counter) + ext
        if filename.lower() not in siblings_lower:
            break
        counter += 1
    return join(parent, filename)
def __extract(self, test_path, extract_func=None, verbatim=False):
    """
    Given an archive file identified by test_path relative to a test files
    directory, return a new temp directory where the archive file has been
    extracted using extract_func. If `verbatim` is True preserve the
    permissions.
    """
    assert test_path and test_path != ''
    if on_linux and py2:
        test_path = fsencode(test_path)
    test_path = to_os_native_path(test_path)
    target_dir = self.get_temp_dir(path.basename(test_path))
    original_archive = self.get_test_loc(test_path)
    if on_linux and py2:
        target_dir = fsencode(target_dir)
        original_archive = fsencode(original_archive)
    extract_func(original_archive, target_dir, verbatim=verbatim)
    return target_dir
def to_os_native_path(path):
    """
    Normalize a path to use the native OS path separator and strip any
    trailing separator.
    """
    if on_linux and py2:
        path = fsencode(path)
    # map both POSIX and Windows separators to the native one
    for sep in (POSIX_PATH_SEP, WIN_PATH_SEP):
        path = path.replace(sep, OS_PATH_SEP)
    return path.rstrip(OS_PATH_SEP)
def process_codebase(self, codebase, custom_output, custom_template, **kwargs):
    """
    Render the codebase file results with the user-provided template and
    write them to the custom output file.
    """
    results = self.get_files(codebase, **kwargs)
    version = codebase.get_or_create_current_header().tool_version
    if on_linux and py2:
        custom_template = fsencode(custom_template)
    write_templated(custom_output, results, version, custom_template)
def get_temp_file(self, extension=None, dir_name='td', file_name='tf'):
    """
    Return a unique new temporary file location to a non-existing temporary
    file that can safely be created without a risk of name collision.
    """
    extension = '.txt' if extension is None else extension
    if on_linux and py2:
        extension = fsencode(extension)
        dir_name = fsencode(dir_name)
        file_name = fsencode(file_name)
    # normalize the extension to start with a dot
    if extension and not extension.startswith(DOT):
        extension = DOT + extension
    return path.join(self.get_temp_dir(dir_name), file_name + extension)
def extract_zip(location, target_dir, *args, **kwargs):
    """
    Extract a zip archive file at location in the target_dir directory.

    Raise an Exception if `location` is not an existing zip file.
    """
    # Bug fix: the original test `not isfile(...) and is_zipfile(...)` could
    # never be true for a plain missing/non-zip path, so nothing was ever
    # rejected; validate that the path is a file AND a zip archive.
    if not (path.isfile(location) and zipfile.is_zipfile(location)):
        raise Exception('Incorrect zip file %(location)r' % locals())

    if on_linux and py2:
        location = fsencode(location)
        target_dir = fsencode(target_dir)

    with zipfile.ZipFile(location) as zipf:
        for info in zipf.infolist():
            name = info.filename
            content = zipf.read(name)
            # NOTE(review): `name` comes straight from the archive and may
            # contain '..' segments (zip-slip): consider sanitizing it
            # before joining if archives are untrusted.
            target = path.join(target_dir, name)
            if not path.exists(path.dirname(target)):
                os.makedirs(path.dirname(target))
            # a member with no content and a trailing separator is a directory
            if not content and target.endswith(path.sep):
                if not path.exists(target):
                    os.makedirs(target)
            if not path.exists(target):
                with open(target, 'wb') as f:
                    f.write(content)
def get_best_handler(location, kinds=all_kinds):
    """
    Return the best handler or None for the file at location.
    """
    if on_linux:
        location = fileutils.fsencode(location)
    location = os.path.abspath(os.path.expanduser(location))
    if not filetype.is_file(location):
        return

    handlers = list(get_handlers(location))
    if TRACE_DEEP:
        logger.debug('get_best_handler: handlers: %(handlers)r ' % locals())
    if not handlers:
        return
    candidates = score_handlers(handlers)
    return candidates and pick_best_handler(candidates, kinds)
def test_scan_does_not_fail_when_scanning_unicode_test_files_from_express():
    """
    Smoke test: a full scan over the express unicode fixtures must not fail.
    """
    # On Windows, Python tar cannot extract these files. Other
    # extractors either fail or change the file name, making the test
    # moot. Git cannot check these files. So for now it makes no sense
    # to test this on Windows at all. Extractcode works fine, but does
    # rename the problematic files.
    test_dir = fsencode(test_env.extract_test_tar_raw(b'unicode_fixtures.tar.gz'))
    args = ['-n0', '--info', '--license', '--copyright', '--package',
            '--email', '--url', '--strip-root', '--json', '-', test_dir]
    run_scan_click(args)