def _fixup_column_headers(self, filename):
    '''
    Rewrite the header row of a closed output file so the column names
    match the final column set (columns can be added mid-job).
    Streams the whole file into a temp file with a fresh header, then
    replaces the original; potentially expensive, but the only way to
    guarantee the first row is correct.
    '''
    # Build a random, filesystem-safe token for the temp file name
    rawToken = b64encode(os.urandom(16)).decode('utf-8')
    allowedChars = string.ascii_letters + string.digits
    token = ''.join(ch for ch in rawToken if ch in allowedChars)

    tmpFileName = "_surveyor_tmp{}_{}".format(token, filename)
    tempPath = os.path.join(self._outDir, tmpFileName)
    oldPath = os.path.join(self._outDir, filename)
    log.msg(1, "Fixing output headers: {} ==> {}".format(tmpFileName, filename))

    with open(tempPath, 'w', encoding='utf-8') as tempFile:
        # Write new header line with the complete set of column names
        headerCols = self._col_create_names_from_keys(filename)
        tempFile.write(self._delimiter.join(headerCols) + '\n')
        # Copy everything except the stale header from the original file
        with open(oldPath, 'r', encoding='utf-8') as oldFile:
            oldFile.readline()  # discard old header line
            for line in oldFile:
                tempFile.write(line)
    shutil.move(tempPath, oldPath)
def _is_file_survey_dupe(self, filePath, measures):
    '''
    Simple mechanism to identify duplicate and near-dupicate code by
    tracking a dictionary of file measures. There are two modes:

    1) File Size: Build a dictionary in memory based on a hash of fileName
    and config info. In the hash buckets store a dict of file sizes for the
    first of each size seen that is not within the dupe threshold. If a
    file size within the threshold of an existing hashed size, treat it as
    a dupe and increment count for reporting.

    2) NBNC CRC: use the nbnc.crc measure to identify duplicates

    Note ASSUME the necessary file metadata will be present in the measures
    dicitonary, as basemodule.py puts it there for the Dupe option.

    Returns the path of the first file this one duplicates, or None.
    '''
    firstDupeFilePath = None

    # 1) File name and Size check
    if isinstance(self._dupeThreshold, int):
        fileSize = int(measures[basemodule.METADATA_FILESIZE])
        dupeKey = (measures[basemodule.METADATA_FULLNAME] +
                   measures[basemodule.METADATA_CONFIG].replace(' ', ''))
        if dupeKey in self._dupeFileSurveys:
            for dupeFileSize, (fileCount, firstFilePath) in self._dupeFileSurveys[dupeKey].items():
                # Near-dupe if size falls within +/- threshold of a seen size
                if (dupeFileSize - self._dupeThreshold) <= fileSize and (
                        fileSize <= (dupeFileSize + self._dupeThreshold)):
                    firstDupeFilePath = firstFilePath
                    self._dupeFileSurveys[dupeKey][dupeFileSize] = (fileCount + 1, firstFilePath)
                    log.msg(1, "Dupe {} by {} of {} bytes: {}".format(
                            fileCount, fileSize - dupeFileSize, fileSize, filePath))
                    break
        else:
            self._dupeFileSurveys[dupeKey] = {}
        if firstDupeFilePath is None:
            self._dupeFileSurveys[dupeKey][fileSize] = (1, filePath)
            log.file(2, "Added {} -- {} to dupe dictionary".format(dupeKey, fileSize))

    # 2) Code CRC check
    # Our relying on the nbnc.crc is brittle, because it is both a code and runtime
    # dependency on the Code csmodule being used. And there are valid scenarios
    # where nbnc.crc may not be present (e.g., skipping dupe file). Thus if the
    # measure isn't present, we fail silently
    else:
        fileCrc = None
        try:
            fileCrc = measures['nbnc.crc']
        except KeyError:
            # BUGFIX: was a bare "except:", which also swallowed
            # KeyboardInterrupt/SystemExit and masked real errors; only a
            # missing key is the expected, tolerated failure here
            log.file(2, "CRC Dupe - nbnc.crc missing: {}".format(filePath))
        if fileCrc in self._dupeFileSurveys:
            fileCount, firstDupeFilePath = self._dupeFileSurveys[fileCrc]
            self._dupeFileSurveys[fileCrc] = (fileCount + 1, firstDupeFilePath)
            log.msg(1, "Dupe {}: {} DUPE_OF {}".format(fileCount, filePath, firstDupeFilePath))
        elif fileCrc is not None:
            self._dupeFileSurveys[fileCrc] = (1, filePath)
            log.file(2, "Added {} -- {} to dupe dictionary".format(filePath, fileCrc))

    return firstDupeFilePath
def __init__(self, configStack, options, file_measured_callback, status_callback):
    '''
    Wire up everything a measurement job needs: the communication queues,
    the output thread, the pool of worker processes, and the folder walker.

    configStack -- configuration state handed to the FolderWalker
    options -- job options; treated as immutable for the life of the job
    file_measured_callback -- passed to the output thread (called per file)
    status_callback -- used for all UI/status output
    '''
    # Options define the life a job and cannot be modified
    self._options = options

    # All UI output is done through the status callback
    self._status_callback = status_callback

    # Keep track of (and allow access to) raw file metrics
    self.numFolders = 0
    self.numUnfilteredFiles = 0
    self.numFilteredFiles = 0
    self.numFilesToProcess = 0

    # Exceptions that occurred in workers are collected and displayed
    # Unlike errors, exceptions will not generate rows in output
    self.exceptions = []

    # Queues to communicate with Workers, and the output thread
    self._taskQueue = multiprocessing.Queue()
    self._controlQueue = multiprocessing.Queue()
    self._outQueue = multiprocessing.Queue()
    self._outThread = jobout.OutThread(self._outQueue, self._controlQueue,
            self._options.profileName, file_measured_callback)

    # Create max number of workers (they will be started later as needed)
    # NOTE(review): assert is stripped under python -O; assumes numWorkers
    # was validated upstream (e.g., during argument parsing) -- confirm
    assert self._options.numWorkers > 0, "Less than 1 worker requested!"
    context = (log.get_context(), self._options.profileName)
    self._workers = self.Workers(self._controlQueue, self._taskQueue,
            self._outQueue, context, self._options.numWorkers)
    log.msg(1, "Created {} workers".format(self._workers.num_max()))

    # Create our object for tracking state of folder walking
    self._pathsToMeasure = options.pathsToMeasure
    self._folderWalker = folderwalk.FolderWalker(
            options.deltaPath, configStack, options.recursive,
            options.includeFolders, options.skipFolders,
            options.fileFilters, options.skipFiles,
            self.add_folder_files)

    # Utility object for managing work packages; holds the state of the
    # work package that is being prepared for sending to queue
    self._workPackage = self.WorkPackage()

    # Other processing state
    self._continueProcessing = True
    self._taskPackagesSent = 0
    self._filesSinceLastSend = 0
def _validate_line(self, configEntry): ''' Is the module being asked to do what it was designed to do? ''' measureOk = configEntry.module.can_do_measure( configEntry.measureFilters) verbOk = configEntry.module.can_do_verb(configEntry.verb) if not (measureOk and verbOk): log.msg( 1, "Failed module validate measureOk/verbOk: {}/{}".format( measureOk, verbOk)) raise utils.ConfigError( uistrings.STR_ErrorConfigInvalidMeasure.format( configEntry.verb, configEntry.measureFilter))
def read_file(self, filePath):
    '''
    Read a Surveyor configuration file and return a list of ConfigEntrys
    to store on the configuration stack with this folder location.

    Any failure while reading/validating is wrapped in a ConfigError
    that names the offending file.
    '''
    try:
        log.msg(1, "Config file: {}".format(filePath))
        configEntries = self._read_file(filePath, [])
        self._validate_file(configEntries)
        # Fixed typo in log message ("Finsihed" -> "Finished")
        log.config(2, "Finished reading config file: {}".format(filePath))
        log.config(3, configEntries)
        return configEntries
    except Exception as e:
        # Chain the original exception so tracebacks show the root cause
        raise utils.ConfigError(
            uistrings.STR_ErrorConfigFile.format(filePath, str(e))) from e
def _open_file(filePath, forceAll): """ Manage the file opening with correct encoding based on any errors in decoding utf-8 default and through inspection of file start. This isn't foolproof - files that use different encodings farther in may blow up later if decoded, but that is rare. """ # Use buffering to reduce the cost of open on larger files fileObj = open(filePath, 'r', buffering=FILE_BUFFERING, encoding='utf_8') # Grab the first bytes of the file start = None try: try: start = _get_file_start(fileObj, FILE_START_UTF8_CHECK) except UnicodeDecodeError as e: fileObj.close() log.file(1, "UTF-8 error, using binary: {}".format(filePath)) fileObj = open(filePath, 'rb', buffering=FILE_BUFFERING) start = _get_file_start(fileObj, FILE_START_CHECK) except Exception as e2: log.msg(1, "Cannot open and read {}: {}".format(filePath, e2)) fileObj.close() # Do tests that look at start of the file if start: keepFileOpen = forceAll if not forceAll: if _is_noncode_file(start): log.file(1, "Skipping, non-code start: {}".format(filePath)) elif not filetype.is_text_file(start): log.file(1, "Skipping, binary char: {}".format(filePath)) else: keepFileOpen = True if not keepFileOpen: fileObj.close() fileObj = None else: if fileObj: fileObj.close() fileObj = None return fileObj
def run(self):
    '''
    Thread entry point: drain the output queue (optionally under the
    profiler) and route failures to the control queue.
    '''
    log.cc(1, "STARTING: Begining to process output queue...")
    try:
        if self._profileName is None:
            self._run()
        else:
            # Profile this thread's work into a per-thread output file
            import cProfile
            profilePath = self._profileName + self.name
            cProfile.runctx('self._run()', globals(), {'self': self},
                    profilePath)
        log.cc(1, "FINISHED processing output queue")
    except KeyboardInterrupt:
        # Forward ctrl-c to the main thread so the job shuts down
        log.cc(1, "Ctrl-c occurred in OUTPUT THREAD")
        _thread.interrupt_main()
    except Exception as problem:
        log.msg(1, "EXCEPTION processing output queue: " + str(problem))
        log.stack()
        self._controlQueue.put_nowait(('JOB', 'EXCEPTION', problem))
    finally:
        log.cc(1, "TERMINATING")
def _write_aggregates(self):
    '''
    For each set of aggregates, create an output file with aggregates
    that exceed threshold.
    HACK - use the output writer by creating a dummy OUT file tag
    '''
    thresholdKey = self._aggregateThresholdKey
    for keyName in list(self._aggregateNames.keys()):
        fileName = str(keyName).replace('.', '')
        hackOutTagMeasure = {'tag_write_aggregates': 'OUT:' + fileName}

        analysisRows = []
        for valueRow in list(self._aggregates[keyName].values()):
            if thresholdKey is None:
                # No threshold configured: every row is written
                shouldWrite = True
            else:
                try:
                    shouldWrite = valueRow[thresholdKey] > self._aggregateThreshold
                except KeyError as e:
                    raise utils.InputException(
                            STR_AggregateThresholdKeyError.format(str(e)))
            if shouldWrite:
                analysisRows.append(valueRow)

        log.msg(1, "Aggregate: {}".format(analysisRows))
        self._writer.write_items(hackOutTagMeasure, analysisRows)
def _put_files_in_queue(self, path, deltaPath, filesAndConfigs): ''' Package files from the path into workItems that are grouped into workPackages and placed into the task queue for jobworkers. Packages are broken up if files number or total size exceeds thresholds to help evenly distribute load across cores ''' if not filesAndConfigs: return for fileName, configEntrys in filesAndConfigs: # Expensive to check file size here, but worth it for pracelling widely # varying file sizes out to cores for CPU intensive jobs. # Profiling shows it is not worth caching this try: fileSize = utils.get_file_size(os.path.join(path, fileName)) except Exception as e: # It is possible (at least in Windows) for a fileName to exist # in the file system but be invalid for Windows calls. This is # the first place the file is accessed through the file system; # if it blows up don't want the job to fall apart, and this is # an unusual case, so don't bother with a pathway back to the main # application; just swallow it and provide debug log.msg(1, str(e)) log.stack() continue log.cc(3, "WorkItem: {}, {}".format(fileSize, fileName)) self.numFilesToProcess += 1 workItem = (path, deltaPath, fileName, configEntrys, self._options, len(filesAndConfigs)) self._workPackage.add(workItem, fileSize) if self._workPackage.ready_to_send() or (self._filesSinceLastSend > MAX_FILES_BEFORE_SEND): self._send_current_package() if not self._check_command(): break
def __init__(self, configFileName, configOverrides, defaultConfigOptions=[]):
    '''
    Build the config stack: either from explicit overrides, or by loading
    the named config file from the default locations.

    configFileName -- bare file name (must NOT include a path component)
    configOverrides -- when truthy, used instead of any config files
    defaultConfigOptions -- default config option tags from the application
    '''
    # NOTE(review): mutable default argument for defaultConfigOptions;
    # shared across all calls -- appears to be stored read-only here,
    # but verify it is never mutated
    log.config(2, "Creating ConfigStack with {}".format(configFileName))
    self._modules = CodeSurveyorModules()
    self._reader = configreader.ConfigReader(self.load_csmodule)
    self._measureRootDir = ''

    # Stack of config files, represented as paths and lists of ConfigEntrys
    self._configStack = []

    # Cache of config file information
    # Key is path name, value is list entries that represent the config file
    self._configFileCache = {}

    # List of default config option tags passed by the application
    self._defaultConfigOptions = defaultConfigOptions

    # Either use overrides or try to read config files
    if configOverrides:
        log.msg(1, "Ignoring config files: {}".format(configOverrides))
        self._configName = ''
        self._setup_config_overrides(configOverrides)
    else:
        self._configName = configFileName
        # Make sure the config file name does not include a path, as the point is
        # to look for a config file in each folder we visit
        if not os.path.dirname(self._configName) == '':
            raise utils.ConfigError(
                uistrings.STR_ErrorConfigFileNameHasPath)
        # Load the default config file to use for this job
        # First try in the root of the job folder; then in the surveyor folder
        if not self._push_file(runtime_dir()):
            if not self._push_file(surveyor_dir()):
                log.msg(
                    1, "{} not present in default locations".format(
                        self._configName))
def _validate_entries(self, configEntries):
    '''
    Are all config file entries consistent with each other, to avoid
    silent double counting? Throws an error exception if not.
    '''
    log.config(2, "Checking for duplicate config entries")
    # Create list of all possible measure/file combos
    # Ask the module to match each measure, to catch wildcard overlap
    fileFilters = []
    possibleMeasures = []
    for entry in configEntries:
        for fileFilter in entry.fileFilters:
            fileFilters.append(fileFilter)
            possibleMeasures.append(
                (fileFilter, entry.measureFilter, entry.moduleName,
                 entry.verb, entry.tags, entry.paramsRaw))
    log.config(4, fileFilters)
    log.config(4, possibleMeasures)
    # Check that no file type would have a measure be double counted
    # If a problem, throw an exception based on the first problem item
    # (duplicate fileFilters are only possible when the list has repeats)
    if len(fileFilters) > len(set(fileFilters)):
        while possibleMeasures:
            possibleMeasureTuple = possibleMeasures.pop()
            log.config(2, "possibleMeasure: {}".format(possibleMeasureTuple))
            (fileFilter, measureFilter, modName,
                    verb, tags, extraParams) = possibleMeasureTuple
            # Don't attempt to do conflict resolution on regex file extensions,
            # both because it doesn't make sense
            if fileFilter.startswith(fileext.CUSTOM_FILE_REGEX):
                continue
            # Shallow warning check for double counting by creating a list of
            # entries based on matching verb and file type
            warningList = [
                (ff, mf, mn, v, t, ep)
                for ff, mf, mn, v, t, ep in possibleMeasures
                if v == verb and fileext.file_ext_match(ff, fileFilter)
            ]
            if warningList:
                log.config(
                    1, "WARNING - Possible double-count: {}".format(
                        str(warningList)))
                # For the deep check look at tag values and measure filter
                # (tags match when same length and identical as sets)
                # NOTE(review): "entry" here is the leftover loop variable
                # from the for-loop above, i.e. always the LAST config
                # entry -- presumably this should be the module belonging
                # to this possibleMeasure's modName; verify
                dupeList = [
                    (v, modName, mn, mf, fileFilter, ff, t, tags, ep, extraParams)
                    for ff, mf, mn, v, t, ep in warningList
                    if len(t) == len(tags) and
                            len(t) == len(set(t) & set(tags)) and
                            entry.module.match_measure(mf, measureFilter)
                ]
                if dupeList:
                    log.msg(
                        1, "ERROR - Double-count: {}".format(str(dupeList)))
                    dupe = dupeList[0]
                    raise utils.ConfigError(
                        uistrings.STR_ErrorConfigDupeMeasures.format(
                            dupe[0], dupe[1], dupe[2], dupe[3], dupe[4],
                            dupe[5], dupe[6], dupe[7], dupe[8], dupe[9]))
def parse_args(self):
    '''
    Do simple command line parsing and set the internal state of our
    Surveyor class based on the arguments. For any syntax we don't
    recognize or help is requested, return help text. Otherwise return
    None which indicates success.
    '''
    try:
        while not self.args.finished():
            self.args.move_next()

            # Disambiguation case for measurePath/fileFilter
            # A '-' may be used to replace optional arg with path/filter
            if self.args.is_cmd() and len(self.args.get_current()) == 1:
                if self.args.is_param_next():
                    self.args.move_next()
                    self._parse_measurement_path()
                continue
            # Assume non-Arg is a measurePath/fileFilter definition
            elif not self.args.is_cmd():
                self._parse_measurement_path()
                continue

            # Our processing is based on matching first character
            fc = self.args.get_current()[1].lower()

            # Debug and profiling support
            if fc in CMDARG_DEBUG:
                self._parse_debug_options()
                log.msg(2, "Args: {}".format(str(self.args)))
            elif fc in CMDARG_PROFILE:
                self._app._profiling = True
                self._app._profileCalls = self._get_next_int(
                        optional=True, default=self._app._profileCalls)
                self._app._profileCalledBy = self._get_next_int(
                        optional=True, default=self._app._profileCalledBy)
                self._app._profileCalled = self._get_next_int(
                        optional=True, default=self._app._profileCalled)
                self._app._profileThreadFilter = self._get_next_str(
                        optional=True, default=self._app._profileThreadFilter)
                self._app._profileNameFilter = self._get_next_str(
                        optional=True, default=self._app._profileNameFilter)

            # Config file settings
            elif fc in CMDARG_CONFIG_CUSTOM:
                self._parse_config_options()

            # Delta path
            elif fc in CMDARG_DELTA:
                self._parse_delta_options()

            # Duplicate processing
            # Can have an optional integer or string after this option
            elif fc in CMDARG_DUPE_PROCESSING:
                self._app._dupeTracking = True
                self._metaDataOptions['DUPE'] = None
                dupeParam = self._get_next_param(optional=True)
                try:
                    dupeParam = int(dupeParam)
                except (TypeError, ValueError):
                    # Param is absent (None) or a non-numeric mode string;
                    # keep it as-is (was a broad "except Exception: pass")
                    pass
                self._app._dupeThreshold = dupeParam

            # Scan and skip options
            elif fc in CMDARG_SCAN_ALL:
                self._parse_scan_options()
            elif fc in CMDARG_SKIP:
                self._parse_skip_options()
            elif fc in CMDARG_INCLUDE_ONLY:
                self._app._jobOpt.includeFolders.extend(
                        self._get_next_param().split(CMDLINE_SEPARATOR))

            # Output
            # BUGFIX: these branches were written "fc in CMDARG_X == fc",
            # which Python chains into "(fc in CMDARG_X) and (CMDARG_X == fc)"
            # -- false whenever the option constant is more than one char,
            # so the options could never match. Membership test only.
            elif fc in CMDARG_METADATA:
                self._parse_metadata_options()
            elif fc in CMDARG_OUTPUT_FILTER:
                self._measureFilter = self._get_next_str()
            elif fc in CMDARG_OUTPUT_TYPE:
                self._app._outType = self._get_next_str()
            elif fc in CMDARG_OUTPUT_FILE:
                self._parse_output_file()
            elif fc in CMDARG_SUMMARY_ONLY:
                self._app._summaryOnly = True
            elif fc in CMDARG_DETAILED:
                self._app._detailed = True
                self._app._detailedPrintSummaryMax = self._get_next_int(
                        optional=True,
                        default=self._app._detailedPrintSummaryMax)
            elif fc in CMDARG_PROGRESS:
                self._app._progress = True
                self._app._printMaxWidth = self._get_next_int(
                        optional=True, default=self._app._printMaxWidth)
            elif fc in CMDARG_QUIET:
                self._app._quiet = True

            # Other options
            elif fc in CMDARG_NUM_WORKERS:
                self._app._jobOpt.numWorkers = self._get_next_int(
                        validRange=range(1, MAX_WORKERS))
            elif fc in CMDARG_RECURSION:
                self._app._jobOpt.recursive = False
            elif fc in CMDARG_BREAK_ERROR:
                self._app._jobOpt.breakOnError = True
            elif fc in CMDARG_AGGREGATES:
                self._parse_aggregate_options()

            # Help/invalid parameter request
            else:
                return self._parse_help_options()

        # Setup the default measurement path if not provided
        if not self._app._jobOpt.pathsToMeasure:
            self._app._jobOpt.pathsToMeasure.append(utils.CURRENT_FOLDER)

        # Setup the default config name if not provided
        if not self.configOverrides and self.configCustom is None:
            self.configCustom = CONFIG_FILE_DEFAULT_NAME

    except Args.ArgsFinishedException as e:
        raise utils.InputException(STR_ErrorParsingEnd.format(str(e)))
    else:
        log.config(4, vars(self._app))
def move_next(self):
    '''
    Advance to the next command-line argument; raises
    ArgsFinishedException when no arguments remain.
    '''
    if self.finished():
        raise self.ArgsFinishedException(self.get_current())
    nextPos = self.argPos + 1
    self.argPos = nextPos
    log.msg(1, "Arg: {}".format(str(self.argList[nextPos])))