Example #1
    def _run(self):
        # Keep processing the queue until the job signals it is done and
        # the queue is empty, or until an abort command is received
        while self._continue_processing():
            try:
                if self._workDone and self._outQueue.empty():
                    break
                filesOutput = self._outQueue.get_nowait()

            except Empty:
                log.cc(3, "EMPTY OUTPUT")
                time.sleep(OUTPUT_EMPTY_WAIT)
            else:
                self.taskPackagesReceived += 1
                log.cc(2, "GOT {} measures".format(len(filesOutput)))

                # Each outputQueue item holds a set of output for multiple files.
                # Each file has a set of output and errors to pack up for the app
                for filePath, outputList, errorList in filesOutput:

                    # Synchronous callback to the application
                    # Output writing and screen update occur in this call
                    self._file_measure_callback(filePath, outputList,
                                                errorList)

                    if errorList:
                        log.file(1, "ERROR measuring: {}".format(filePath))
                        self._controlQueue.put_nowait(
                            ('JOB', 'ERROR', filePath))
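
The loop above is a standard non-blocking queue consumer: poll with get_nowait(), back off briefly on queue.Empty, and stop once the producer signals completion and the queue is drained. A minimal standalone sketch of that pattern follows; OUTPUT_EMPTY_WAIT, drain_queue, and the is_done flag are illustrative names, not the project's actual API.

import time
from queue import Queue, Empty

OUTPUT_EMPTY_WAIT = 0.1  # hypothetical poll interval, not the project's constant

def drain_queue(out_queue, is_done):
    """Consume items until the producer reports done and the queue is empty."""
    results = []
    while True:
        try:
            if is_done() and out_queue.empty():
                break
            item = out_queue.get_nowait()
        except Empty:
            time.sleep(OUTPUT_EMPTY_WAIT)   # back off instead of busy-spinning
        else:
            results.append(item)
    return results

# Usage: a producer thread put()s items on out_queue, then flips the done flag
q = Queue()
q.put("measures-for-file")
print(drain_queue(q, is_done=lambda: True))   # ['measures-for-file']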
Example #2
    def file_measured_callback(self, filePath, outputList, errorList):
        '''
        Job output thread callback to provide file measurements.
        A list of output and potential errors is provided for each file.
        Called ONCE for each file in the job; if there were multiple
        config entries for the file, outputList will have multiple items.
        '''
        self._numFilesProcessed += 1
        self._errorList.extend(errorList)

        fileTime = 0
        fileMeasured = False
        for measures, analysisResults in outputList:
            log.file(3, "Callback: {} -- {}".format(filePath, measures))
            if list(measures.items()):
                # Zero out dupe measures in place
                if self._dupeTracking:
                    self._filter_dupes(filePath, measures, analysisResults)

                # Send results to metrics writer
                fileMeasured = True
                self._numMeasures += max(1, len(analysisResults))
                if not self._summaryOnly:
                    self._writer.write_items(measures, analysisResults)

                # Capture summary metrics and aggregates
                self._stash_summary_metrics(filePath, measures, analysisResults)
                self._stash_aggregates(filePath, analysisResults)

                fileTime += utils.safe_dict_get_float(measures, basemodule.METADATA_TIMING)

        self._numFilesMeasured += (1 if fileMeasured else 0)
        self._display_file_progress(filePath, fileTime)
        self._display_feedback()
Example #3
    def _valid_folder(self, folderName):
        '''
        Is this folder one we should process?
        '''
        if not self._skipFolders and not self._includeFolders:
            return True

        validFolder = True

        # First verify this folder is not to be skipped
        if self._skipFolders:
            _root, currentFolder = os.path.split(folderName)
            for folderPattern in self._skipFolders:
                if fnmatch.fnmatch(currentFolder, folderPattern):
                    log.file(1, "Skiping folder: %s" % folderName)
                    validFolder = False
                    break

        # Next, verify it is on the include list
        if validFolder and self._includeFolders:
            includeMatch = False
            for folderPattern in self._includeFolders:
                if fnmatch.fnmatch(folderName, folderPattern):
                    includeMatch = True
                    break
            if not includeMatch:
                log.file(1, "Excluding folder: %s" % folderName)
                validFolder = False

        return validFolder
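
A compact sketch of the same skip/include logic, matching skip patterns against the folder's basename and include patterns against the full path, as the method above does; valid_folder and the sample patterns are illustrative only.

import os
import fnmatch

def valid_folder(folder_name, skip_folders=(), include_folders=()):
    """Return True if the folder passes the skip patterns and include patterns."""
    # Illustrative sketch, not the project's implementation
    current = os.path.split(folder_name)[1]
    if any(fnmatch.fnmatch(current, pat) for pat in skip_folders):
        return False
    if include_folders:
        return any(fnmatch.fnmatch(folder_name, pat) for pat in include_folders)
    return True

print(valid_folder("/src/tests", skip_folders=["test*"]))       # False
print(valid_folder("/src/app", include_folders=["*/src/*"]))    # True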
Example #4
def _file_match(fileName, fileFilter):
    '''
    Perform the match check of filename against filter.
    In the case of blank-extension detection, look for no extension.
    Otherwise use a regex comparison with a cached version of either the
    re from fnmatch.translate or a custom RE string provided in the filter
    '''
    if BLANK_FILE_EXT == fileFilter:
        root, ext = os.path.splitext(fileName)
        filterMatch = ('' == ext and not root.startswith('.'))
    else:
        filterRe = None
        try:
            filterRe = _FilterCache[fileFilter]
        except KeyError:
            if fileFilter.startswith(CUSTOM_FILE_REGEX):
                filterRe = re.compile(fileFilter.replace(CUSTOM_FILE_REGEX, ''), RE_OPTIONS)
            else:
                filterRe = re.compile(fnmatch.translate(fileFilter), RE_OPTIONS)
            _FilterCache[fileFilter] = filterRe

        filterMatch = filterRe.match(fileName)

        if log.level() > 3 and filterMatch is None:
            log.file(4, "FilterExtFilter: %s, no match:  %s" % (filterRe.pattern[:10], fileName))

    return bool(filterMatch)
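
The point of _FilterCache is to compile each glob filter into a regex once (via fnmatch.translate) and reuse it for every file. A minimal sketch of that caching idea, omitting the custom-regex prefix handling; glob_match and re.IGNORECASE here are assumptions, not the project's RE_OPTIONS.

import re
import fnmatch

_filter_cache = {}

def glob_match(file_name, file_filter):
    """Match file_name against a glob, compiling and caching the regex once."""
    # Illustrative sketch; assumes case-insensitive matching
    try:
        filter_re = _filter_cache[file_filter]
    except KeyError:
        filter_re = re.compile(fnmatch.translate(file_filter), re.IGNORECASE)
        _filter_cache[file_filter] = filter_re
    return filter_re.match(file_name) is not None

print(glob_match("Report.CSV", "*.csv"))   # True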
Example #5
def open_file_for_survey(filePath, existingFile, forceAll, sizeThreshold):
    '''
    Includes logic for handling different file encodings and
    options for skipping files based on detection of the file's content.
    existingFile is used as an optimization to prevent reopening
    a file multiple times.
    '''
    # Check extensions first, since we already have that data
    if not forceAll and filetype.is_noncode_ext(filePath):
        log.file(1, "Skipping, non-code ext: {}".format(filePath))
        return

    # Then check for size threshold; faster than opening file
    if sizeThreshold > 0:
        fileSize = utils.get_file_size(filePath)
        if sizeThreshold < fileSize:
            log.file(1, "Skipping, size {}: {}".format(fileSize, filePath))
            return

    # Reset an existing file, or open a new one
    if existingFile:
        existingFile.seek(0)
        rv = existingFile
    else:
        rv = _open_file(filePath, forceAll)
    return rv
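
A hedged sketch of the two cheap gates used above: checking size before opening, and rewinding an existing handle instead of reopening. os.path.getsize stands in for utils.get_file_size; should_open and reuse_or_open are illustrative names.

import os

def should_open(file_path, size_threshold):
    """Skip files larger than the threshold without opening them."""
    # Illustrative sketch of the size gate only
    if size_threshold > 0 and os.path.getsize(file_path) > size_threshold:
        return False
    return True

def reuse_or_open(file_path, existing_file=None):
    """Rewind an already-open handle instead of reopening the same file."""
    if existing_file is not None:
        existing_file.seek(0)
        return existing_file
    return open(file_path, 'r', encoding='utf-8')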
Example #6
    def _open_file(self, filename):
        MeasureWriter._open_file(self, filename)
        filePath = os.path.join(self._outDir, filename)
        outFile = open(filePath, 'w', encoding='utf-8')
        doc = minidom.Document()
        outFile.write(doc.toprettyxml())
        log.file(2, "Opened XML Output File: {}".format(filePath))
        return outFile
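
A standalone sketch of the minidom seeding shown above; open_xml_output is an illustrative name, and the empty Document().toprettyxml() call writes only the XML declaration.

from xml.dom import minidom

def open_xml_output(file_path):
    """Create an output file seeded with an empty, pretty-printed XML document."""
    # Illustrative sketch; writes just the <?xml ...?> declaration
    out_file = open(file_path, 'w', encoding='utf-8')
    doc = minidom.Document()
    out_file.write(doc.toprettyxml())
    return out_file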
Example #7
    def _open_file(self, fileName):
        MeasureWriter._open_file(self, fileName)
        filePath = os.path.join(self._outDir, fileName)
        self._rawFiles[fileName] = open(filePath,
                                        'w',
                                        encoding='utf-8',
                                        newline='')
        outWriter = csv.writer(self._rawFiles[fileName],
                               delimiter=self._delimiter,
                               quoting=csv.QUOTE_NONNUMERIC)
        log.file(2, "Opened Delimited Output File: {}".format(filePath))
        return outWriter
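
A minimal version of the delimited-writer setup: csv.QUOTE_NONNUMERIC quotes every non-numeric field, and newline='' lets the csv module control line endings. open_csv_output and the sample row are illustrative.

import csv

def open_csv_output(file_path, delimiter=','):
    """Open a delimited output file and return (file handle, csv writer)."""
    # Illustrative sketch; newline='' is required so csv controls line endings
    raw_file = open(file_path, 'w', encoding='utf-8', newline='')
    writer = csv.writer(raw_file, delimiter=delimiter,
                        quoting=csv.QUOTE_NONNUMERIC)
    return raw_file, writer

raw, writer = open_csv_output("measures.csv")
writer.writerow(["file.py", 120, 35.5])   # strings quoted, numbers left bare
raw.close()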
Example #8
    def _remove_skip_dirs(self, root, dirs):
        '''
        Decide which child dirs should be skipped.
        Filter out dirs in place (vs. a copy), so os.walk will skip them
        '''
        dirsToRemove = []
        for folderPattern in self._skipFolders:
            dirsToRemove += fnmatch.filter(dirs, folderPattern)
        dirsToRemove = set(dirsToRemove)

        for folder in dirsToRemove:
            log.file(1, "Skiping over: %s\\%s" % (root, folder))
            dirs.remove(folder)
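
The in-place mutation is what makes this work: with topdown=True, os.walk consults the same dirs list object after the loop body runs, so removing entries from it prunes the walk. A small self-contained sketch; walk_without and the sample patterns are illustrative.

import os
import fnmatch

def walk_without(root_path, skip_patterns):
    """os.walk that prunes directories matching any skip pattern."""
    # Illustrative sketch of in-place pruning
    for root, dirs, files in os.walk(root_path, topdown=True):
        to_remove = {d for pat in skip_patterns for d in fnmatch.filter(dirs, pat)}
        # dirs[:] keeps the same list object, so os.walk sees the pruning
        dirs[:] = sorted(d for d in dirs if d not in to_remove)
        yield root, files

for root, files in walk_without(".", [".git", "__pycache__"]):
    pass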
Example #9
    def _get_files_to_process(self, folderName, fileNames, fileFilters, configPath):
        '''
        Filter the list of files based on command line options and active
        config file filters
        '''
        # if fileFilters is empty it means an empty config file, so skip all files
        if not fileFilters:
            return []

        # Optimize the most common matching of extensions by creating a cache of
        # simple '*.xxx' extensions from the config filters for each config file
        filterExts = []
        try:
            filterExts = self._configFilterCache[configPath]
        except KeyError:
            filterSplits = [os.path.splitext(fileFilter) for fileFilter in fileFilters if
                                os.path.splitext(fileFilter)[0] == '*']
            filterExts = [ext for _root, ext in filterSplits]
            self._configFilterCache[configPath] = filterExts

        # Select files based on matching filters
        filesToProcess = []
        for fileName in fileNames:

            # Filter file list by command-line positive filter, if provided
            if fileext.file_matches_filters(fileName, self._fileExtFilters):

                # Optimize the most common case of a direct file extension match, then
                # fall back to a full filter match against the config file filters
                _root, fileExt = os.path.splitext(fileName)
                fileFilter = None
                if fileExt in filterExts:
                    fileFilter = '*' + fileExt
                else:
                    fileFilter = fileext.file_matches_filters(fileName, fileFilters)
                if fileFilter is not None:
                    filesToProcess.append((fileName, fileFilter))

        # Remove files that should be skipped
        if self._skipFiles:
            filesToProcess = [(fileName, fileFilter) for fileName, fileFilter in filesToProcess if
                                not fileext.file_matches_filters(fileName, self._skipFiles)]

        # Debug tracing of files that were not measured
        if log.level():
            filesSkipped = set(fileNames) - set([f for f, _filter in filesToProcess])
            if filesSkipped:
                log.file(2, "SkippingFiles: %s" % filesSkipped)

        return filesToProcess
Example #10
    def walk(self, pathToMeasure):
        '''
        Walk folders while filtering, sending updates via callback.
        May be asked to terminate in our callback.
        '''
        self._configStack.set_measure_root(pathToMeasure)

        for folderName, childFolders, fileNames in os.walk(pathToMeasure, topdown=True):
            log.file(2, "Scanning: {}".format(folderName))

            numUnfilteredFiles = len(fileNames)
            filesAndConfigs = []

            if fileNames and self._valid_folder(folderName):

                # Get the current set of active config filters
                fileFilters, activeConfigs, configPath = self._configStack.get_configuration(folderName)

                # Filter out files by options and config items
                filesToProcess = self._get_files_to_process(folderName, fileNames, fileFilters, configPath)

                # Create list of tuples with fileName and configEntrys for each file
                for fileName, fileFilter in filesToProcess:
                    configEntrys = self._get_configs_for_file(fileName, fileFilter, activeConfigs, configPath)
                    filesAndConfigs.append((fileName, configEntrys))

            # For a delta measure, create a fully qualified delta path name
            # Note that when we split on the path to measure, the remainder starts with a separator
            deltaFolder = None
            if self._deltaPath is not None:
                deltaFolder = self._deltaPath + folderName[len(pathToMeasure):]

            # Call back to job with files and configs
            continueProcessing = self._add_files_to_job(
                        folderName,
                        deltaFolder,
                        filesAndConfigs,
                        numUnfilteredFiles)

            if not continueProcessing or not self._expandSubdirs:
                break

            # Remove any folders, and sort remaining to ensure consistent walk
            # order across file systems (for our testing if nothing else)
            self._remove_skip_dirs(folderName, childFolders)
            childFolders.sort()
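
The deltaFolder line maps each walked folder to the same relative location under a second root by slicing off the measured root's prefix. An equivalent sketch using os.path.relpath; delta_folder_for and the sample paths are illustrative, not the project's code.

import os

def delta_folder_for(folder_name, path_to_measure, delta_path):
    """Map a folder under the measured root to the same subpath under delta_path."""
    # Illustrative sketch of the delta-path mapping
    relative = os.path.relpath(folder_name, path_to_measure)
    return delta_path if relative == '.' else os.path.join(delta_path, relative)

print(delta_folder_for("/work/src/app/ui", "/work/src", "/work/delta"))
# /work/delta/app/ui (on POSIX paths)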
Example #11
    def _stash_aggregates(self, filePath, analysisResults):
        '''
        As file results are received, if aggregation of results was requested,
        store the aggregate information.
        The aggregate functionality is based on names of items generated
        by specific csmodules; consider it a fatal error if what is
        requested for aggregation and what is present in analysisResults
        are out of sync
        '''
        # For each set of aggregates go through results and add
        # them to the appropriate aggregate set
        for aggKey, aggNames in self._aggregateNames.items():
            aggregateDict = self._aggregates.setdefault(aggKey, {})
            log.file(2, "Aggregating {} items in {}".format(len(analysisResults), aggKey))
            for result in analysisResults:
                # aggKey has the name of the value from results that we
                # will be keying the aggregate dictionary on
                try:
                    newKey = result[aggKey]
                except KeyError as e:
                    raise utils.InputException(STR_AggregateKeyError.format(str(e)))
                else:
                    aggregate = aggregateDict.setdefault(newKey, {'aggregate.count':0})

                    # Specific names can be provided to aggregate, or all can be aggregated
                    namesToAggregate = aggNames
                    if isinstance(aggNames, str):
                        if aggNames == 'all':
                            namesToAggregate = list(result.keys())

                    # Take each value from the result and aggregate according to type
                    for itemName in namesToAggregate:
                        self._aggregate_update(itemName, result[itemName], aggregate)

                    # Count the item
                    aggregate['aggregate.count'] += 1

                    # Update the aggregate
                    aggregateDict[newKey] = aggregate

            # The dictionary for this aggKey has been updated, so stash it
            self._aggregates[aggKey] = aggregateDict
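
The aggregation is a two-level dictionary: one bucket per distinct value of aggKey, created lazily with setdefault, plus a per-bucket count. A reduced sketch of that shape, with the per-type _aggregate_update logic simplified to numeric sums; aggregate_results and the sample rows are illustrative.

def aggregate_results(results, agg_key, names_to_aggregate):
    """Group result dicts by results[agg_key] and sum the named numeric fields."""
    # Illustrative sketch; the real module dispatches on value type
    aggregates = {}
    for result in results:
        bucket = aggregates.setdefault(result[agg_key], {'aggregate.count': 0})
        for name in names_to_aggregate:
            bucket[name] = bucket.get(name, 0) + result.get(name, 0)
        bucket['aggregate.count'] += 1
    return aggregates

rows = [{'dir': 'src', 'nbnc': 10}, {'dir': 'src', 'nbnc': 5}, {'dir': 'test', 'nbnc': 2}]
print(aggregate_results(rows, 'dir', ['nbnc']))
# {'src': {'aggregate.count': 2, 'nbnc': 15}, 'test': {'aggregate.count': 1, 'nbnc': 2}}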
Example #12
    def _is_file_survey_dupe(self, filePath, measures):
        '''
        Simple mechanism to identify duplicate and near-duplicate code by tracking
        a dictionary of file measures.  There are two modes:

        1) File Size: Build a dictionary in memory based on a hash of fileName
        and config info. In the hash buckets store a dict of file sizes for
        the first of each size seen that is not within the dupe threshold.
        If a file size is within the threshold of an existing hashed size, treat
        it as a dupe and increment the count for reporting.

        2) NBNC CRC: use the nbnc.crc measure to identify duplicates

        Note we ASSUME the necessary file metadata will be present in the
        measures dictionary, as basemodule.py puts it there for the Dupe option.
        '''
        firstDupeFilePath = None

        # 1) File name and Size check
        if isinstance(self._dupeThreshold, int):
            fileSize = int(measures[basemodule.METADATA_FILESIZE])
            dupeKey = (measures[basemodule.METADATA_FULLNAME] +
                        measures[basemodule.METADATA_CONFIG].replace(' ', ''))
            if dupeKey in self._dupeFileSurveys:
                for dupeFileSize, (fileCount, firstFilePath) in self._dupeFileSurveys[dupeKey].items():
                    if (dupeFileSize - self._dupeThreshold) <= fileSize and (
                            fileSize <= (dupeFileSize + self._dupeThreshold)):
                        firstDupeFilePath = firstFilePath
                        self._dupeFileSurveys[dupeKey][dupeFileSize] = (fileCount + 1, firstFilePath)
                        log.msg(1, "Dupe {} by {} of {} bytes: {}".format(
                                    fileCount, fileSize - dupeFileSize, fileSize, filePath))
                        break
            else:
                self._dupeFileSurveys[dupeKey] = {}

            if firstDupeFilePath is None:
                self._dupeFileSurveys[dupeKey][fileSize] = (1, filePath)
                log.file(2, "Added {} -- {} to dupe dictionary".format(dupeKey, fileSize))

        # 2) Code CRC check
        # Relying on nbnc.crc is brittle, because it creates both a code and a runtime
        # dependency on the Code csmodule being used. And there are valid scenarios
        # where nbnc.crc may not be present (e.g., skipping a dupe file). Thus if the
        # measure isn't present, we fail silently
        else:
            fileCrc = None
            try:
                fileCrc = measures['nbnc.crc']
            except KeyError:
                log.file(2, "CRC Dupe - nbnc.crc missing: {}".format(filePath))
            if fileCrc in self._dupeFileSurveys:
                fileCount, firstDupeFilePath = self._dupeFileSurveys[fileCrc]
                self._dupeFileSurveys[fileCrc] = (fileCount + 1, firstDupeFilePath)
                log.msg(1, "Dupe {}: {} DUPE_OF {}".format(fileCount, filePath, firstDupeFilePath))
            elif fileCrc is not None:
                self._dupeFileSurveys[fileCrc] = (1, filePath)
                log.file(2, "Added {} -- {} to dupe dictionary".format(filePath, fileCrc))

        return firstDupeFilePath
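
The CRC mode reduces to a dictionary keyed on a content hash: the first path seen wins, and later hits are counted as dupes. A standalone approximation using zlib.crc32 over the raw file bytes; the project keys on the nbnc.crc measure (code lines only), so this sketch is not equivalent, just the same bookkeeping.

import zlib

_seen_crcs = {}

def is_dupe(file_path):
    """Return the first path seen with the same CRC, or None if this file is new."""
    # Illustrative sketch; hashes the whole file, not the NBNC lines
    with open(file_path, 'rb') as f:
        crc = zlib.crc32(f.read())
    if crc in _seen_crcs:
        count, first_path = _seen_crcs[crc]
        _seen_crcs[crc] = (count + 1, first_path)
        return first_path
    _seen_crcs[crc] = (1, file_path)
    return None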
Example #13
def _open_file(filePath, forceAll):
    """
    Manage file opening with the correct encoding, based on any errors
    decoding the default utf-8 and on inspection of the file start.
    This isn't foolproof - files that use different encodings farther
    in may blow up later when decoded, but that is rare.
    """

    # Use buffering to reduce the cost of open on larger files
    fileObj = open(filePath, 'r', buffering=FILE_BUFFERING, encoding='utf_8')

    # Grab the first bytes of the file
    start = None
    try:
        try:
            start = _get_file_start(fileObj, FILE_START_UTF8_CHECK)

        except UnicodeDecodeError as e:
            fileObj.close()
            log.file(1, "UTF-8 error, using binary: {}".format(filePath))
            fileObj = open(filePath, 'rb', buffering=FILE_BUFFERING)
            start = _get_file_start(fileObj, FILE_START_CHECK)

    except Exception as e2:
        log.msg(1, "Cannot open and read {}: {}".format(filePath, e2))
        fileObj.close()

    # Do tests that look at start of the file
    if start:
        keepFileOpen = forceAll
        if not forceAll:
            if _is_noncode_file(start):
                log.file(1, "Skipping, non-code start: {}".format(filePath))
            elif not filetype.is_text_file(start):
                log.file(1, "Skipping, binary char: {}".format(filePath))
            else:
                keepFileOpen = True
        if not keepFileOpen:
            fileObj.close()
            fileObj = None
    else:
        if fileObj:
            fileObj.close()
        fileObj = None

    return fileObj
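
The core trick above is attempting a UTF-8 text read of the file's start and falling back to a binary handle on UnicodeDecodeError. A compressed sketch of just that fallback; open_with_fallback and probe_bytes are illustrative names.

def open_with_fallback(file_path, probe_bytes=4096):
    """Open as UTF-8 text; reopen in binary mode if the start fails to decode."""
    # Illustrative sketch of the encoding fallback only
    file_obj = open(file_path, 'r', encoding='utf_8')
    try:
        file_obj.read(probe_bytes)
    except UnicodeDecodeError:
        file_obj.close()
        file_obj = open(file_path, 'rb')
    else:
        file_obj.seek(0)    # rewind so the caller reads from the start
    return file_obj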
Example #14
    def _survey_lines(self, linesToSurvey, params, measurements, analysis):
        '''
        Analyze file line by line. linesToSurvey is an iterable set of lines.
        Processing is driven by the regular expressions in member variables.
        The order of processing each line is:
             - Preprocess line string
             - Detect machine vs. human code
             - Detect blank lines
             - Detect single and multi-line comments
             - Capture line measures
             - Perform line processing (searches, routines, etc.)
        '''
        # Set up the dictionary for measures and searches we'll do
        self._survey_start(params)

        # If no lines to process, may still want to output empty measures
        if linesToSurvey is None:
            linesToSurvey = []

        # Track whether inside a multi-line comment - ignore nesting
        scanningMultiLine = False

        for bufferLine in linesToSurvey:
            # Handle option of reading out binary files
            bufferLine = utils.safe_string(bufferLine)

            self.counts['RawLines'][self._activeBlock] += 1
            if self._logLevel: log.file(4, "Raw: {}".format(bufferLine))
            try:
                # Allow specializations to skip and/or special-case certain lines
                if self._alternate_line_processing(bufferLine):
                    continue

                # If a line separator is defined, apply it
                lines = [bufferLine]
                if self.addLineSep is not None:
                    lines = bufferLine.split(self.addLineSep)

                #
                # Read through the lines to measure and process them one at a time
                # This is the main measure loop for csmodules derived from NBNC
                #
                for rawLine in lines:
                    self.counts['TotalLines'][self._activeBlock] += 1

                    # Allow for clean up of artifacts or other pre-processing
                    line = self._preprocess_line(rawLine)

                    # Detect true blank lines
                    if self.reTrueBlankLine.match(line):
                        self.counts['TrueBlankLines'][self._activeBlock] += 1
                        self._log_line(line, "T")
                        continue

                    # Block Detection
                    if len(self.blockDetectors) > 1:
                        if self._detect_block_change(line, analysis):
                            scanningMultiLine = False  # Don't allow multi-line comment to span blocks

                    # Determine comment state
                    # This is done before blank-line detection, to handle multi-line
                    # comment syntax that should count as "blank", e.g., /* on its own line
                    onCommentLine, scanningMultiLine = self._detect_line_comment(line, scanningMultiLine)

                    # Detect "blank" lines with no useful info
                    if self._detect_blank_line(line):
                        continue

                    # Measure and analyze -- overridden in derived classes
                    self._measure_line(line, onCommentLine)
                    self._analyze_line(line, analysis, onCommentLine)

            except Exception as e:
                log.stack()
                if self.stopOnError:
                    raise utils.FileMeasureError(
                        "Problem processing line: {} with module: {}\n{}".format(
                        str(sum(self.counts['RawLines'])), self.__class__.__name__, str(e)))

        # Package results
        self._survey_end(measurements, analysis)
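
The per-line loop is essentially a small state machine: blank detection, a multi-line comment flag carried across lines, then counting. A stripped-down sketch of that flow for C-style /* */ and // comments; the regexes and counting buckets are illustrative, not the csmodule's actual ones.

import re

RE_BLANK = re.compile(r'^\s*$')
RE_LINE_COMMENT = re.compile(r'^\s*//')

def survey_lines(lines):
    """Count blank, comment, and code lines, tracking multi-line /* */ comments."""
    # Illustrative sketch; ignores nesting and strings containing comment markers
    counts = {'blank': 0, 'comment': 0, 'code': 0}
    in_multiline = False
    for line in lines:
        if RE_BLANK.match(line):
            counts['blank'] += 1
            continue
        on_comment = in_multiline or bool(RE_LINE_COMMENT.match(line)) or '/*' in line
        if '/*' in line and '*/' not in line.split('/*', 1)[1]:
            in_multiline = True
        elif in_multiline and '*/' in line:
            in_multiline = False
        counts['comment' if on_comment else 'code'] += 1
    return counts

print(survey_lines(["int x;", "/* start", "still comment */", "", "// note"]))
# {'blank': 1, 'comment': 3, 'code': 1}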