def _skip_file(self, d, files):
    '''
    The function passed into shutil.copytree to ignore certain
    patterns and filetypes.

    Currently skipped:
        Directories     - handled by copytree
        Symlinks        - handled by copytree
        Non-text files  - anything whose mime type contains neither
                          'text' nor 'json' (binaries can't be scanned)

    :param d:     directory being visited by copytree
    :param files: names of the entries in ``d``
    :returns:     list of names (relative to ``d``) to skip
    '''
    skip_list = []
    for f in files:
        f_full = os.path.join(d, f)
        # Directories and symlinks are handled by copytree itself.
        if os.path.isdir(f_full) or os.path.islink(f_full):
            continue
        # don't exclude this file! we need it to parse core collection archives
        if f == 'insights_archive.txt':
            continue
        mime_type = content_type.from_file(f_full)
        if 'text' not in mime_type and 'json' not in mime_type:
            skip_list.append(f)
    return skip_list
def _assert_type(self, _input, is_buffer=False):
    '''
    Validate that ``_input`` is a supported compressed tar archive.

    Records the detected outer mime type on ``self.content_type``.

    :raises InvalidContentType: outer type is not one of ``self.TAR_FLAGS``
    :raises InvalidArchive:     inner payload is not a tar archive
    '''
    self.content_type = content_type.from_file(_input)
    if self.content_type not in self.TAR_FLAGS:
        raise InvalidContentType(self.content_type)
    # Peek inside the compression layer: the payload itself must be tar.
    if content_type.from_file_inner(_input) != 'application/x-tar':
        raise InvalidArchive('No compressed tar archive')
def _extract_sosreport(self, path):
    '''
    Extract a sosreport archive (or accept an already-extracted directory).

    :param path: path to a directory or a compressed tar archive
    :returns:    path to the extracted (or pre-existing) report directory
    :raises Exception: when the compression type cannot be determined or
                       extraction fails
    '''
    self.logger.con_out("Beginning SOSReport Extraction")
    compression_sig = content_type.from_file(path).lower()
    if 'directory' in compression_sig:
        self.logger.info('%s appears to be a %s - continuing', path, compression_sig)
        # Clear out origin_path as we don't have one
        self.origin_path = None
        return path
    elif 'compressed data' in compression_sig:
        if compression_sig == 'xz compressed data':
            try:
                self.logger.info(
                    'Data Source Appears To Be LZMA Encrypted Data - decompressing into %s',
                    self.origin_path)
                # tarfile handles xz natively since Python 3.3; no shell needed.
                # (Replaces the previous os.system mkdir/tar calls, which were
                # vulnerable to shell injection via the path.)
                os.makedirs(self.origin_path, exist_ok=True)
                with tarfile.open(path, 'r:xz') as tar:
                    tar.extractall(self.origin_path)
                return_path = os.path.join(self.origin_path, os.listdir(self.origin_path)[0])
                return return_path
            except Exception as e:  # pragma: no cover
                self.logger.exception(e)
                # Format the message eagerly: Exception() does not do
                # %-style interpolation of extra args.
                raise Exception(
                    'DecompressionError, Unable to decrypt LZMA compressed file %s' % path)
        else:
            self.logger.info(
                'Data Source Appears To Be %s - decompressing into %s',
                compression_sig, self.origin_path)
            try:
                # Context manager closes the tarfile even on failure
                # (the previous code leaked the open handle).
                with tarfile.open(path, 'r') as p:
                    p.extractall(self.origin_path)
                    return_path = os.path.join(
                        self.origin_path, os.path.commonprefix(p.getnames()))
                return return_path
            except Exception as e:  # pragma: no cover
                self.logger.exception(e)
                raise Exception(
                    "DeCompressionError: Unable to De-Compress %s into %s"
                    % (path, self.origin_path))
    else:  # pragma: no cover
        raise Exception(
            'CompressionError: Unable To Determine Compression Type')
def extract(path, timeout=None, extract_dir=None):
    '''
    Context-manager generator that unpacks an archive into a temp directory.

    Picks a zip or tar extractor based on the file's detected mime type,
    yields an ``Extraction`` describing the result, and always removes the
    temp directory on exit.

    :param path:        archive to extract
    :param timeout:     passed through to the extractor
    :param extract_dir: optional parent directory for the extraction
    '''
    if from_file(path) == "application/zip":
        extractor = ZipExtractor(timeout=timeout)
    else:
        extractor = TarExtractor(timeout=timeout)

    tmp = None
    try:
        tmp = extractor.from_path(path, extract_dir=extract_dir).tmp_dir
        yield Extraction(tmp, extractor.content_type)
    finally:
        # Clean up even if the caller's body raised.
        if tmp:
            fs.remove(tmp, chmod=True)
def analyze(paths, excludes=None):
    '''
    Analyze one or more inputs, each a plain-text file, a directory,
    or an archive, and collect the results.

    :param paths:    a single path or a list of paths
    :param excludes: optional exclusion patterns forwarded to processing
    :returns:        a ``Result`` whose children are the per-path results
    '''
    if not isinstance(paths, list):
        paths = [paths]

    results = []
    for p in paths:
        if content_type.from_file(p) == "text/plain":
            results.append(_load(p))
        elif os.path.isdir(p):
            results.extend(_process(p, excludes))
        else:
            # Anything else is treated as an archive: unpack, then process.
            with extract(p) as ex:
                results.extend(_process(ex.tmp_dir, excludes))
    return Result(children=results)