def _run(self):
    '''
    Drain the output queue, passing each file's results to the file-measure
    callback.

    Keeps looping while _continue_processing() allows it, and stops early
    once the job has flagged its work as done and the queue has nothing left.
    '''
    while self._continue_processing():
        # Finished only when the job is done AND everything has been consumed
        if self._workDone and self._outQueue.empty():
            break

        try:
            outputPackage = self._outQueue.get_nowait()
        except Empty:
            # Nothing ready yet; wait a moment before polling again
            log.cc(3, "EMPTY OUTPUT")
            time.sleep(OUTPUT_EMPTY_WAIT)
            continue

        self.taskPackagesReceived += 1
        log.cc(2, "GOT {} measures".format(len(outputPackage)))

        # Each queue item carries results for several files; every file has
        # its own output list and error list
        for filePath, outputList, errorList in outputPackage:
            # Synchronous callback; output writing and screen updates
            # happen inside this call
            self._file_measure_callback(filePath, outputList, errorList)

            if errorList:
                log.file(1, "ERROR measuring: {}".format(filePath))
                self._controlQueue.put_nowait(('JOB', 'ERROR', filePath))
def file_measured_callback(self, filePath, outputList, errorList):
    '''
    Callback from the job output thread that delivers one file's results.

    Invoked a single time per file in the job. outputList holds one
    (measures, analysisResults) pair per config entry that applied to the
    file; errorList holds any errors raised while measuring it.
    '''
    self._numFilesProcessed += 1
    self._errorList.extend(errorList)

    totalFileTime = 0
    gotMeasures = False

    for measures, analysisResults in outputList:
        log.file(3, "Callback: {} -- {}".format(filePath, measures))

        # Nothing to record for an empty measure set
        if not list(measures.items()):
            continue

        # Dupe filtering zeroes out measures in place
        if self._dupeTracking:
            self._filter_dupes(filePath, measures, analysisResults)

        # Hand the results to the metrics writer
        gotMeasures = True
        self._numMeasures += max(1, len(analysisResults))
        if not self._summaryOnly:
            self._writer.write_items(measures, analysisResults)

        # Keep summary metrics and aggregates up to date
        self._stash_summary_metrics(filePath, measures, analysisResults)
        self._stash_aggregates(filePath, analysisResults)
        totalFileTime += utils.safe_dict_get_float(measures, basemodule.METADATA_TIMING)

    self._numFilesMeasured += int(gotMeasures)
    self._display_file_progress(filePath, totalFileTime)
    self._display_feedback()
def _valid_folder(self, folderName):
    '''
    Report whether the given folder should be processed.

    The last path component is tested against the skip patterns, then the
    full folder name is tested against the include patterns (when present).
    '''
    if not (self._skipFolders or self._includeFolders):
        return True

    # Skip list is matched against the folder's own name only
    if self._skipFolders:
        leafFolder = os.path.split(folderName)[1]
        for skipPattern in self._skipFolders:
            if fnmatch.fnmatch(leafFolder, skipPattern):
                log.file(1, "Skiping folder: %s" % folderName)
                return False

    # Include list is matched against the whole folder path
    if self._includeFolders:
        onIncludeList = any(
            fnmatch.fnmatch(folderName, includePattern)
            for includePattern in self._includeFolders)
        if not onIncludeList:
            log.file(1, "Excluding folder: %s" % folderName)
            return False

    return True
def _file_match(fileName, fileFilter):
    '''
    Return True when fileName satisfies fileFilter, otherwise False.

    fileFilter is one of:
      - BLANK_FILE_EXT: matches names that have no extension and do not
        start with a dot
      - a string prefixed with CUSTOM_FILE_REGEX: the rest of the string is
        used as a regular expression
      - anything else: treated as an fnmatch-style pattern

    Compiled expressions are stored in _FilterCache keyed by the filter
    string so that each filter is compiled only once.
    '''
    if BLANK_FILE_EXT == fileFilter:
        # Return the boolean itself. Testing it with "is not None" (as was
        # done previously) is always true, so every file matched this filter.
        root, ext = os.path.splitext(fileName)
        return ext == '' and not root.startswith('.')

    try:
        filterRe = _FilterCache[fileFilter]
    except KeyError:
        if fileFilter.startswith(CUSTOM_FILE_REGEX):
            # Strip only the leading marker; the expression itself may
            # legitimately contain the same text
            pattern = fileFilter[len(CUSTOM_FILE_REGEX):]
        else:
            pattern = fnmatch.translate(fileFilter)
        filterRe = re.compile(pattern, RE_OPTIONS)
        _FilterCache[fileFilter] = filterRe

    filterMatch = filterRe.match(fileName)
    if log.level() > 3 and filterMatch is None:
        log.file(4, "FilterExtFilter: %s, no match: %s" % (filterRe.pattern[:10], fileName))
    return filterMatch is not None
def open_file_for_survey(filePath, existingFile, forceAll, sizeThreshold):
    '''
    Return a file object positioned at the start and ready to be surveyed,
    or None when the file should be skipped.

    Skips are decided from the extension (unless forceAll), from the size
    (when sizeThreshold is positive), and by _open_file when a new file has
    to be opened. Passing existingFile avoids reopening a file that is
    already open; it is rewound and returned.
    '''
    # Extension test is cheapest since the path is already in hand
    if not forceAll and filetype.is_noncode_ext(filePath):
        log.file(1, "Skipping, non-code ext: {}".format(filePath))
        return None

    # Size test is still cheaper than opening the file
    if sizeThreshold > 0:
        fileSize = utils.get_file_size(filePath)
        if fileSize > sizeThreshold:
            log.file(1, "Skipping, size {}: {}".format(fileSize, filePath))
            return None

    if not existingFile:
        return _open_file(filePath, forceAll)

    # Reuse the caller's handle from the beginning
    existingFile.seek(0)
    return existingFile
def _open_file(self, filename):
    '''
    Create the named output file in the output directory, write the
    pretty-printed form of an empty XML document to it, and return the
    open file object.
    '''
    MeasureWriter._open_file(self, filename)
    xmlPath = os.path.join(self._outDir, filename)
    xmlFile = open(xmlPath, 'w', encoding='utf-8')
    xmlFile.write(minidom.Document().toprettyxml())
    log.file(2, "Opened XML Output File: {}".format(xmlPath))
    return xmlFile
def _open_file(self, fileName):
    '''
    Create the named delimited output file in the output directory and
    return a csv writer bound to it.

    The underlying file object is kept in self._rawFiles under fileName.
    '''
    MeasureWriter._open_file(self, fileName)
    csvPath = os.path.join(self._outDir, fileName)

    # newline='' leaves line-ending handling to the csv module
    self._rawFiles[fileName] = open(csvPath, 'w', encoding='utf-8', newline='')
    csvWriter = csv.writer(
        self._rawFiles[fileName],
        delimiter=self._delimiter,
        quoting=csv.QUOTE_NONNUMERIC)

    log.file(2, "Opened Delimited Output File: {}".format(csvPath))
    return csvWriter
def _remove_skip_dirs(self, root, dirs):
    '''
    Drop child folders that match any skip pattern.

    dirs is edited in place rather than copied, so that an os.walk that
    supplied the list will not descend into the removed folders.
    '''
    # A set, so a folder matched by several patterns is removed only once
    matchedDirs = set()
    for skipPattern in self._skipFolders:
        matchedDirs.update(fnmatch.filter(dirs, skipPattern))

    for skipDir in matchedDirs:
        log.file(1, "Skiping over: %s\\%s" % (root, skipDir))
        dirs.remove(skipDir)
def _get_files_to_process(self, folderName, fileNames, fileFilters, configPath):
    '''
    Select the files in a folder that should be measured.

    Applies the command-line positive filters, the active config file
    filters, and the skip-file filters, and returns a list of
    (fileName, matchingFilter) tuples.
    '''
    # No filters means an empty config file, so nothing is selected
    if not fileFilters:
        return []

    # The common case is a plain '*.xxx' filter; keep a per-config cache of
    # those extensions so they can be matched without a full filter check
    try:
        filterExts = self._configFilterCache[configPath]
    except KeyError:
        filterExts = []
        for configFilter in fileFilters:
            filterRoot, filterExt = os.path.splitext(configFilter)
            if filterRoot == '*':
                filterExts.append(filterExt)
        self._configFilterCache[configPath] = filterExts

    filesToProcess = []
    for fileName in fileNames:
        # Command-line positive filter comes first
        if not fileext.file_matches_filters(fileName, self._fileExtFilters):
            continue

        # Try the direct extension match, then fall back to a full match
        # against the config file filters
        fileExt = os.path.splitext(fileName)[1]
        if fileExt in filterExts:
            matchedFilter = '*' + fileExt
        else:
            matchedFilter = fileext.file_matches_filters(fileName, fileFilters)

        if matchedFilter is not None:
            filesToProcess.append((fileName, matchedFilter))

    # Drop anything on the skip list
    if self._skipFiles:
        filesToProcess = [
            entry for entry in filesToProcess
            if not fileext.file_matches_filters(entry[0], self._skipFiles)]

    # Debug trace of the files that were left out
    if log.level():
        keptNames = {keptName for keptName, _filter in filesToProcess}
        filesSkipped = set(fileNames) - keptNames
        if filesSkipped:
            log.file(2, "SkippingFiles: %s" % filesSkipped)

    return filesToProcess
def walk(self, pathToMeasure):
    '''
    Walk the folder tree under pathToMeasure, filtering files and handing
    each folder's files and configs to the job via _add_files_to_job.

    The walk ends early when that callback returns a false value, or after
    the first folder when sub-directory expansion is off.
    '''
    self._configStack.set_measure_root(pathToMeasure)

    for folderName, childFolders, fileNames in os.walk(pathToMeasure, topdown=True):
        log.file(2, "Scanning: {}".format(folderName))
        numUnfilteredFiles = len(fileNames)

        filesAndConfigs = []
        if fileNames and self._valid_folder(folderName):
            # Config filters currently in force for this folder
            fileFilters, activeConfigs, configPath = self._configStack.get_configuration(folderName)

            # Narrow the files down by options and config items
            filesToProcess = self._get_files_to_process(
                folderName, fileNames, fileFilters, configPath)

            # Pair every selected file with its config entries
            filesAndConfigs = [
                (fileName,
                 self._get_configs_for_file(fileName, fileFilter, activeConfigs, configPath))
                for fileName, fileFilter in filesToProcess]

        # For a delta measure build the matching folder under the delta path;
        # the slice past pathToMeasure begins with the path separator
        if self._deltaPath is None:
            deltaFolder = None
        else:
            deltaFolder = self._deltaPath + folderName[len(pathToMeasure):]

        # Hand the folder's files and configs to the job
        keepGoing = self._add_files_to_job(
            folderName, deltaFolder, filesAndConfigs, numUnfilteredFiles)
        if not (keepGoing and self._expandSubdirs):
            break

        # Prune skipped folders, then sort what is left so the walk order is
        # consistent across file systems
        self._remove_skip_dirs(folderName, childFolders)
        childFolders.sort()
def _stash_aggregates(self, filePath, analysisResults):
    '''
    Fold one file's analysis results into the requested aggregates.

    Aggregation is keyed on item names produced by specific csmodules. A
    result that lacks the requested aggregate key is treated as fatal and
    raises utils.InputException.
    '''
    for aggKey, aggNames in self._aggregateNames.items():
        aggregateDict = self._aggregates.setdefault(aggKey, {})
        log.file(2, "Aggregating {} items in {}".format(len(analysisResults), aggKey))

        # The string 'all' asks for every item in each result; otherwise
        # aggNames itself supplies the names
        aggregateAll = isinstance(aggNames, str) and aggNames == 'all'

        for result in analysisResults:
            # The value stored under aggKey is what this aggregate is keyed on
            try:
                newKey = result[aggKey]
            except KeyError as e:
                raise utils.InputException(STR_AggregateKeyError.format(str(e)))

            aggregate = aggregateDict.setdefault(newKey, {'aggregate.count':0})

            namesToAggregate = list(result.keys()) if aggregateAll else aggNames
            for itemName in namesToAggregate:
                self._aggregate_update(itemName, result[itemName], aggregate)

            # Count the item, then store the updated aggregate
            aggregate['aggregate.count'] += 1
            aggregateDict[newKey] = aggregate

        # Store the updated dictionary for this aggKey
        self._aggregates[aggKey] = aggregateDict
def _is_file_survey_dupe(self, filePath, measures):
    '''
    Identify duplicate and near-duplicate files using a dictionary of
    previously seen file measures.

    Returns the path of the first file this one duplicates, or None when the
    file is not a dupe (in which case it is recorded for later comparisons).

    There are two modes, selected by the type of self._dupeThreshold:

    1) File size (threshold is an int): files are bucketed by a key built
       from the file name and config metadata. Each bucket maps a file size
       to (count, firstFilePath). A file whose size is within the threshold
       of a recorded size is a dupe of that entry and bumps its count.
    2) NBNC CRC (any other threshold): the 'nbnc.crc' measure is used as the
       key, mapping to (count, firstFilePath).

    The size mode assumes the file metadata is present in the measures
    dictionary and raises KeyError if it is not.
    '''
    firstDupeFilePath = None

    # 1) File name and size check
    if isinstance(self._dupeThreshold, int):
        fileSize = int(measures[basemodule.METADATA_FILESIZE])
        dupeKey = (measures[basemodule.METADATA_FULLNAME] +
                   measures[basemodule.METADATA_CONFIG].replace(' ', ''))

        sizeBucket = self._dupeFileSurveys.setdefault(dupeKey, {})
        for dupeFileSize, (fileCount, firstFilePath) in sizeBucket.items():
            if (dupeFileSize - self._dupeThreshold) <= fileSize <= (
                    dupeFileSize + self._dupeThreshold):
                firstDupeFilePath = firstFilePath
                # Only the value of an existing key changes, and the loop is
                # left straight away, so updating during iteration is safe
                sizeBucket[dupeFileSize] = (fileCount + 1, firstFilePath)
                log.msg(1, "Dupe {} by {} of {} bytes: {}".format(
                    fileCount, fileSize - dupeFileSize, fileSize, filePath))
                break

        if firstDupeFilePath is None:
            sizeBucket[fileSize] = (1, filePath)
            log.file(2, "Added {} -- {} to dupe dictionary".format(dupeKey, fileSize))

    # 2) Code CRC check
    # Relying on nbnc.crc is brittle, because it is both a code and runtime
    # dependency on the Code csmodule being used, and there are valid
    # scenarios where nbnc.crc may not be present (e.g., skipping dupe file).
    # So a missing measure is logged and otherwise ignored.
    else:
        fileCrc = None
        try:
            fileCrc = measures['nbnc.crc']
        except KeyError:
            # Only a missing measure is tolerated; a bare except here would
            # also hide KeyboardInterrupt, SystemExit and real defects
            log.file(2, "CRC Dupe - nbnc.crc missing: {}".format(filePath))

        if fileCrc in self._dupeFileSurveys:
            fileCount, firstDupeFilePath = self._dupeFileSurveys[fileCrc]
            self._dupeFileSurveys[fileCrc] = (fileCount + 1, firstDupeFilePath)
            log.msg(1, "Dupe {}: {} DUPE_OF {}".format(fileCount, filePath, firstDupeFilePath))
        elif fileCrc is not None:
            self._dupeFileSurveys[fileCrc] = (1, filePath)
            log.file(2, "Added {} -- {} to dupe dictionary".format(filePath, fileCrc))

    return firstDupeFilePath
def _open_file(filePath, forceAll):
    """
    Open a file for reading, choosing text or binary mode from how its first
    bytes decode, and return the file object or None if it should be skipped.

    The file is first opened as UTF-8 text; if reading the start fails to
    decode it is reopened in binary mode. Unless forceAll is set, files whose
    start looks like non-code or non-text are closed and skipped. Only the
    start is inspected, so a file that changes encoding farther in may still
    fail when decoded later.
    """
    # Buffering reduces the cost of open on larger files
    fileObj = open(filePath, 'r', buffering=FILE_BUFFERING, encoding='utf_8')

    # Read the first bytes of the file
    start = None
    try:
        try:
            start = _get_file_start(fileObj, FILE_START_UTF8_CHECK)
        except UnicodeDecodeError:
            fileObj.close()
            log.file(1, "UTF-8 error, using binary: {}".format(filePath))
            fileObj = open(filePath, 'rb', buffering=FILE_BUFFERING)
            start = _get_file_start(fileObj, FILE_START_CHECK)
    except Exception as readError:
        log.msg(1, "Cannot open and read {}: {}".format(filePath, readError))
        fileObj.close()

    # Nothing could be read, so there is nothing to survey
    if not start:
        if fileObj:
            fileObj.close()
        return None

    if forceAll:
        return fileObj

    # Decide from the start of the file whether it is worth keeping open
    if _is_noncode_file(start):
        log.file(1, "Skipping, non-code start: {}".format(filePath))
    elif not filetype.is_text_file(start):
        log.file(1, "Skipping, binary char: {}".format(filePath))
    else:
        return fileObj

    fileObj.close()
    return None
def _survey_lines(self, linesToSurvey, params, measurements, analysis): ''' Analyze file line by line. linesToSurvey is an iterable set of lines. Processing is driven by the regular expressions in member variables. The order of processing each line is: - Preprocess line string - Detect machine vs. human code - Detect blank lines - Detect single and multi-line comments - Capture line measures - Peform line processing (searches, routines, etc.) ''' # Setup dictionary for measures and searches we'll do self._survey_start(params) # If no lines to process, may still want to output empty measures if linesToSurvey is None: linesToSurvey = [] # Track whether inside a multi-line comment - ignore nesting scanningMultiLine = False for bufferLine in linesToSurvey: # Handle option of reading out binary files bufferLine = utils.safe_string(bufferLine) self.counts['RawLines'][self._activeBlock] += 1 if self._logLevel: log.file(4, "Raw: {}".format(bufferLine)) try: # Allow specializations to skip and/or special-case certain lines if self._alternate_line_processing(bufferLine): continue # If line seperator, apply it lines = [bufferLine] if self.addLineSep is not None: lines = bufferLine.split(self.addLineSep) # # Read through the lines to measure and process them one at a time # This is the main measure loop for csmodules derived from NBNC # for rawLine in lines: self.counts['TotalLines'][self._activeBlock] += 1 # Allow for clean up of artifacts or other pre-processing line = self._preprocess_line(rawLine) # Detect true blank lines if self.reTrueBlankLine.match(line): self.counts['TrueBlankLines'][self._activeBlock] += 1 self._log_line(line, "T") continue # Block Detection if len(self.blockDetectors) > 1: if self._detect_block_change(line, analysis): scanningMultiLine = False # Don't allow multi-line comment to span blocks # Determine comment state # This is done before blank lines to consider multi-line # comment syntax that will be counted as "blank", e.g., /* on it's own line 
onCommentLine, scanningMultiLine = self._detect_line_comment(line, scanningMultiLine) # Detect "blank" lines with no useful info if self._detect_blank_line(line): continue # Measure and analyze -- overriden in derived classes self._measure_line(line, onCommentLine) self._analyze_line(line, analysis, onCommentLine) except Exception as e: log.stack() if self.stopOnError: raise utils.FileMeasureError( "Problem processing line: {} with module: {}\n{}".format( str(sum(self.counts['RawLines'])), self.__class__.__name__, str(e))) # Package results self._survey_end(measurements, analysis)