Example #1
def _file_match(fileName, fileFilter):
    '''
    Perform the match check of fileName against fileFilter.
    For blank-extension detection, look for no extension.
    Otherwise use a regex comparison with a cached pattern, built either
    from fnmatch.translate or from a custom RE string in the filter
    '''
    if BLANK_FILE_EXT == fileFilter:
        root, ext = os.path.splitext(fileName)
        filterMatch = ('' == ext and not root.startswith('.'))
    else:
        filterRe = None
        try:
            filterRe = _FilterCache[fileFilter]
        except KeyError:
            if fileFilter.startswith(CUSTOM_FILE_REGEX):
                filterRe = re.compile(fileFilter.replace(CUSTOM_FILE_REGEX, ''), RE_OPTIONS)
            else:
                filterRe = re.compile(fnmatch.translate(fileFilter), RE_OPTIONS)
            _FilterCache[fileFilter] = filterRe

        filterMatch = filterRe.match(fileName)

        if trace.level() and filterMatch is None:
            trace.file(3, "FilterExtFilter: %s, no match:  %s" % (filterRe.pattern[:10], fileName))

    return bool(filterMatch)
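
The cache pattern above reduces to a few lines of standard library. A minimal
standalone sketch, where compile_filter is a hypothetical name and re.IGNORECASE
stands in for the project-specific RE_OPTIONS:

import fnmatch
import re

_filterCache = {}

def compile_filter(fileFilter):
    # Build the regex once per filter string, then serve it from the cache
    try:
        return _filterCache[fileFilter]
    except KeyError:
        filterRe = re.compile(fnmatch.translate(fileFilter), re.IGNORECASE)
        _filterCache[fileFilter] = filterRe
        return filterRe

print(compile_filter('*.py').match('module.py') is not None)   # True
print(compile_filter('*.py').match('module.pyc') is not None)  # False
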
Example #2
def is_text_file(fileObject):

    textChars = string.letters + string.digits + string.punctuation + string.whitespace
    bytesToCheck = 128  # Big enough window to grab, but small for speed
    startPoint = 4  # Skip start of file, for hidden text codes
    minWindowSize = 32  # Get a big enough min window to be feasible
    nonTextThreshold = 0.2  # Have some tolerance to avoid false positives

    # Grab the first bytes of the file, STRIPPING NULLS (for unicode text files)
    fileBytes = utils.strip_null_chars(
        utils.get_file_start(fileObject, bytesToCheck))

    # Special case for PDF that looks like text but isn't
    if is_pdf_file(fileObject):
        isBelowThreshold = False
        trace.file(
            3, "   IsTextFile({0}): {1} ==> PDF File detected".format(
                isBelowThreshold, os.path.basename(fileObject.name)))
    else:
        isBelowThreshold = utils.check_bytes_below_threshold(
            fileBytes, textChars, minWindowSize, startPoint, nonTextThreshold)
        trace.file(
            3, "   IsTextFile({0}): {1} ==> {2}".format(
                isBelowThreshold, os.path.basename(fileObject.name),
                fileBytes))

    return isBelowThreshold
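
utils.get_file_start and utils.check_bytes_below_threshold are internal helpers,
but the heuristic is easy to sketch: count bytes outside the printable set and
tolerate a small share. A rough standalone approximation (looks_like_text is a
hypothetical name; string.ascii_letters is the version-neutral spelling of the
Python 2 string.letters used above):

import string

def looks_like_text(data, nonTextThreshold=0.2):
    # Tolerate a small share of bytes outside the printable/whitespace set
    textChars = set(string.ascii_letters + string.digits +
                    string.punctuation + string.whitespace)
    if not data:
        return True
    nonText = sum(1 for ch in data if ch not in textChars)
    return float(nonText) / len(data) < nonTextThreshold

print(looks_like_text("def main():\n    pass\n"))   # True
print(looks_like_text("\x00\x01\x02\x03\x04 ABC"))  # False
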
Example #3
    def _run(self):
        # We keep processing the queue until the job signals it is done and
        # the queue is empty, or we receive an abort command
        while self._continue_processing():
            try:
                if self._workDone and self._outQueue.empty():
                    break
                filesOutput = self._outQueue.get_nowait()

            except Empty:
                trace.cc(3, "EMPTY OUTPUT")
                time.sleep(OUTPUT_EMPTY_WAIT)
            else:
                self.taskPackagesReceived += 1
                trace.cc(2, "GOT {0} measures".format(len(filesOutput)))

                # Each outputQueue item holds the output for multiple files.
                # Each file has a set of output and potential errors
                # that we pass back to the application
                for filePath, outputList, errorList in filesOutput:

                    # Synchronous callback to the application
                    # Output writing and screen update occurs in this call
                    self._file_measure_callback(filePath, outputList, errorList)

                    if errorList:
                        trace.file(1, "ERROR measuring: {0}".format(filePath))
                        self._controlQueue.put_nowait(('JOB', 'ERROR', filePath))
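
The try/except Empty poll with a short sleep is the standard non-blocking queue
pattern. A minimal version (drain and work_done are hypothetical names; Python 3
spells the import as queue, Python 2 as Queue):

import time
from queue import Queue, Empty

OUTPUT_EMPTY_WAIT = 0.1  # Stand-in value; the real constant is project-specific

def drain(outQueue, work_done):
    while True:
        try:
            if work_done() and outQueue.empty():
                break
            item = outQueue.get_nowait()
        except Empty:
            # Nothing ready yet; back off briefly instead of spinning
            time.sleep(OUTPUT_EMPTY_WAIT)
        else:
            print("got", item)

q = Queue()
q.put("measures")
drain(q, lambda: True)  # Prints 'got measures', then exits once the queue drains
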
Example #4
    def _valid_folder(self, folderName):
        '''
        Is this folder one we should process?
        '''
        if not self._skipFolders and not self._includeFolders:
            return True

        validFolder = True

        # First verify this folder is not to be skipped
        if self._skipFolders:
            _root, currentFolder = os.path.split(folderName)
            for folderPattern in self._skipFolders:
                if fnmatch.fnmatch(currentFolder, folderPattern):
                    trace.file(1, "Skiping folder: %s" % folderName)
                    validFolder = False
                    break

        # Next verify if it is on the include list
        if validFolder and self._includeFolders:
            includeMatch = False
            for folderPattern in self._includeFolders:
                if fnmatch.fnmatch(folderName, folderPattern):
                    includeMatch = True
                    break
            if not includeMatch:
                trace.file(1, "Excluding folder: %s" % folderName)
                validFolder = False

        return validFolder
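
fnmatch.fnmatch does the shell-style matching used for both lists. Note the
pattern must cover the whole string, which is why the skip check above matches
against the bare folder name rather than the full path:

import fnmatch

print(fnmatch.fnmatch('build-debug', 'build*'))       # True
print(fnmatch.fnmatch('/src/build-debug', 'build*'))  # False: no full-string match
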
Example #5
def is_noncode_ext(filePath):
    rv = False
    if is_compressed_ext(filePath) or _has_ext(filePath,
                                               NonCodeFileExtensions):
        rv = True
        trace.file(3, "   NonCodeExt:  {0}".format(os.path.basename(filePath)))
    return rv
Example #6
    def file_measured_callback(self, filePath, outputList, errorList):
        '''
        Job output thread callback to provide file measurements.
        A list of output and potential errors is provided for each file.
        Called ONCE for each file in the job; if there were multiple
        config entries for the file, outputList will have multiple items.
        '''
        self._numFilesProcessed += 1
        self._errorList.extend(errorList)

        fileTime = 0
        fileMeasured = False
        for measures, analysisResults in outputList:
            trace.file(2, "Callback: {0} -- {1}".format(filePath, measures))
            if measures.items():
                # Zero out dupe measures in place
                if self._dupeTracking:
                    self._filter_dupes(filePath, measures, analysisResults)

                # Send results to metrics writer
                fileMeasured = True
                self._numMeasures += max(1, len(analysisResults))
                if not self._summaryOnly:
                    self._writer.write_items(measures, analysisResults)

                # Capture summary metrics and aggregates
                self._stash_summary_metrics(filePath, measures, analysisResults)
                self._stash_aggregates(filePath, analysisResults)

                fileTime += utils.safe_dict_get_float(measures, basemodule.METADATA_TIMING)

        self._numFilesMeasured += (1 if fileMeasured else 0)
        self._display_file_progress(filePath, fileTime)
        self._display_feedback()
Example #7
def is_noncode_file(fileObject):
    maxWindowSize = 30
    fileStart = utils.get_file_start(fileObject, maxWindowSize)
    phraseFound = utils.check_start_phrases(fileStart, NonCodeFileStart)
    trace.file(3, "   NonCodeFileStart({0}): {1} ==> {2}".format(
            phraseFound, fileStart, os.path.basename(fileObject.name)))
    return phraseFound is not None
Example #8
    def _valid_folder(self, folderName):
        '''
        Is this folder one we should process?
        '''
        if not self._skipFolders and not self._includeFolders:
            return True

        validFolder = True

        # First verify this folder is not to be skipped
        if self._skipFolders:
            _root, currentFolder = os.path.split(folderName)
            for folderPattern in self._skipFolders:
                if fnmatch.fnmatch(currentFolder, folderPattern):
                    trace.file(1, "Skiping folder: {0}".format(folderName))
                    validFolder = False
                    break

        # Next verify if it is on the include list
        if validFolder and self._includeFolders:
            includeMatch = False
            for folderPattern in self._includeFolders:
                if fnmatch.fnmatch(folderName, folderPattern):
                    includeMatch = True
                    break
            if not includeMatch:
                trace.file(1, "Excluding folder: {0}".format(folderName))
                validFolder = False

        return validFolder
Example #9
    def _open_file(self, filename):
        MeasureWriter._open_file(self, filename)
        filePath = os.path.join(self._outDir, filename)
        outFile = file(filePath, 'w')
        doc = minidom.Document()
        outFile.write(doc.toprettyxml())
        trace.file(2, "Opened XML Output File: {0}".format(filePath))
        return outFile
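
file() is the Python 2 built-in alias for open(). The minidom call seeds the
output with nothing but an XML declaration, which is easy to verify standalone:

from xml.dom import minidom

doc = minidom.Document()
# An empty Document pretty-prints to just the XML declaration
print(repr(doc.toprettyxml()))  # '<?xml version="1.0" ?>\n'
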
Example #10
def is_noncode_file(fileObject):
    maxWindowSize = 30
    fileStart = utils.get_file_start(fileObject, maxWindowSize)
    phraseFound = utils.check_start_phrases(fileStart, NonCodeFileStart)
    trace.file(
        3, "   NonCodeFileStart({0}): {1} ==> {2}".format(
            phraseFound, fileStart, os.path.basename(fileObject.name)))
    return phraseFound is not None
Example #11
    def _open_file(self, fileName):
        MeasureWriter._open_file(self, fileName)
        filePath = os.path.join(self._outDir, fileName)
        self._rawFiles[fileName] = file(filePath, 'wb')
        outWriter = csv.writer(
            self._rawFiles[fileName], delimiter=self._delimiter, quoting=csv.QUOTE_NONNUMERIC)
        trace.file(2, "Opened Delimited Output File: {0}".format(filePath))
        return outWriter
Example #12
    def _open_file(self, fileName):
        MeasureWriter._open_file(self, fileName)
        filePath = os.path.join(self._outDir, fileName)
        self._rawFiles[fileName] = file(filePath, 'wb')
        outWriter = csv.writer(self._rawFiles[fileName],
                               delimiter=self._delimiter,
                               quoting=csv.QUOTE_NONNUMERIC)
        trace.file(2, "Opened Delimited Output File: {0}".format(filePath))
        return outWriter
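
csv.QUOTE_NONNUMERIC quotes string fields and leaves numbers bare, which keeps
the delimited output parseable without type hints. A quick check (Python 3
writes to a text buffer; the Python 2 code above uses a 'wb' file instead):

import csv
import io

buf = io.StringIO()
outWriter = csv.writer(buf, delimiter=',', quoting=csv.QUOTE_NONNUMERIC)
outWriter.writerow(['file.py', 120, 4.5])
print(buf.getvalue())  # "file.py",120,4.5
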
Example #13
    def file_measured_callback(self, filePath, measures, analysisResults):
        '''
        Callback from the measurement module.
        We store up a list of tuples with the work output for a given file
        '''
        assert filePath == self._currentFilePath, "Measure callback out of sync"
        trace.cc(3, "_file_measured_callback: {0}".format(filePath))
        trace.file(3, "  measures: {0}".format(measures))
        trace.file(3, "  analysis: {0}".format(analysisResults))
        self._currentFileOutput.append((measures, analysisResults))
Example #14
    def _remove_skip_dirs(self, root, dirs):
        '''
        Decide which child dirs should be skipped.
        Filter out dirs in place (vs. a copy), so os.walk will skip them
        '''
        dirsToRemove = []
        for folderPattern in self._skipFolders:
            dirsToRemove += fnmatch.filter(dirs, folderPattern)
        dirsToRemove = set(dirsToRemove)

        for folder in dirsToRemove:
            trace.file(1, "Skiping over: %s\\%s" % (root, folder))
            dirs.remove(folder)
Example #15
    def _remove_skip_dirs(self, root, dirs):
        '''
        Decide which child dirs should be skipped.
        Filter out dirs in place (vs. a copy), so os.walk will skip them
        '''
        dirsToRemove = []
        for folderPattern in self._skipFolders:
            dirsToRemove += fnmatch.filter(dirs, folderPattern)
        dirsToRemove = set(dirsToRemove)

        for folder in dirsToRemove:
            trace.file(1, "Skiping over: {0}\\{1}".format(root, folder))
            dirs.remove(folder)
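
With topdown=True (the os.walk default), pruning only works because the dirs
list is mutated in place, as dirs.remove does above; rebinding the name would
have no effect. The same idea with slice assignment and sample skip patterns:

import fnmatch
import os

skipFolders = ['.git', '__pycache__', 'build*']

for root, dirs, files in os.walk('.'):
    # Slice assignment mutates the list object os.walk holds,
    # so pruned folders are never descended into
    dirs[:] = [d for d in dirs
               if not any(fnmatch.fnmatch(d, pat) for pat in skipFolders)]
    print(root, dirs)
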
Example #16
def _has_ext(filePath, extensions):
    '''
    Check whether the file has one of the given extensions, after
    stripping off any numeric-only extensions
    '''
    fileExt = None
    while True:
        (base, extension) = os.path.splitext(filePath)
        fileExt = str(extension).strip('.')
        if fileExt.isdigit():
            filePath = base
        else:
            break
    trace.file(3, "   File Extension:  {0}".format(fileExt))
    return fileExt.lower() in extensions
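
This makes split-archive names like 'backup.tar.001' resolve to their real
extension. The loop in isolation (strip_numeric_exts is a hypothetical name):

import os

def strip_numeric_exts(filePath):
    # Peel off trailing numeric-only extensions one at a time
    while True:
        base, extension = os.path.splitext(filePath)
        fileExt = extension.strip('.')
        if fileExt.isdigit():
            filePath = base
        else:
            return fileExt.lower()

print(strip_numeric_exts('backup.tar.001'))  # tar
print(strip_numeric_exts('module.PY'))       # py
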
Example #17
    def _get_files_to_process(self, folderName, fileNames, fileFilters, configPath):
        '''
        Filter the list of files based on command line options and active
        config file filters
        '''
        # if fileFilters is empty it means an empty config file, so skip all files
        if not fileFilters:
            return []

        # Optimize the most common case, extension matching, by caching the
        # simple '*.xxx' extensions from the filters of each config file
        filterExts = []
        try:
            filterExts = self._configFilterCache[configPath]
        except KeyError:
            filterSplits = [os.path.splitext(fileFilter) for fileFilter in fileFilters if
                                os.path.splitext(fileFilter)[0] == '*']
            filterExts = [ext for _root, ext in filterSplits]
            self._configFilterCache[configPath] = filterExts

        # Select files based on matching filters
        filesToProcess = []
        for fileName in fileNames:

            # Filter the file list by the command-line positive filter, if provided
            if fileext.file_matches_filters(fileName, self._fileExtFilters):

                # Optimize the most common case of a direct file-extension match,
                # then fall back to a full match against the config file filters
                _root, fileExt = os.path.splitext(fileName)
                fileFilter = None
                if fileExt in filterExts:
                    fileFilter = '*' + fileExt
                else:
                    fileFilter = fileext.file_matches_filters(fileName, fileFilters)
                if fileFilter is not None:
                    filesToProcess.append((fileName, fileFilter))

        # Remove files that should be skipped
        if self._skipFiles:
            filesToProcess = [(fileName, fileFilter) for fileName, fileFilter in filesToProcess if
                                not fileext.file_matches_filters(fileName, self._skipFiles)]

        # Debug tracing of files that were not measured
        if trace.level():
            filesSkipped = set(fileNames) - set([f for f, _filter in filesToProcess])
            if filesSkipped:
                trace.file(2, "SkippingFiles: %s" % filesSkipped)

        return filesToProcess
Example #18
    def _measure_text(self, fileObject, measurements):
        '''
        Default handler for text-based files: go through each file line
        '''
        if self._traceLevel: trace.file(4, "Document: {0}".format(fileObject))
        for rawLine in fileObject:
            self.totalLines += 1
            line = utils.strip_null_chars(rawLine)

            # Detect blank lines
            if self.reBlankLine.match(line):
                self.blankLines += 1
                continue

            # Content line
            self.contentLines += 1
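
Assuming reBlankLine is a whitespace-only pattern, the counting loop works
standalone:

import re

reBlankLine = re.compile(r'^\s*$')

totalLines = blankLines = contentLines = 0
for line in ['def f():\n', '\n', '    return 1\n']:
    totalLines += 1
    if reBlankLine.match(line):
        blankLines += 1
        continue
    contentLines += 1

print(totalLines, blankLines, contentLines)  # 3 1 2
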
Example #19
def is_text_file(fileObject):

    textChars = string.letters + string.digits + string.punctuation + string.whitespace
    bytesToCheck = 128          # Big enough window to grab, but small for speed
    startPoint = 4              # Skip start of file, for hidden text codes
    minWindowSize = 32          # Get a big enough min window to be feasible
    nonTextThreshold = 0.2      # Have some tolerance to avoid false positives

    # Grab the first bytes of the file, STRIPPING NULLS (for unicode text files)
    fileBytes = utils.strip_null_chars(utils.get_file_start(fileObject, bytesToCheck))

    isBelowThreshold = utils.check_bytes_below_threshold(
            fileBytes, textChars, minWindowSize, startPoint, nonTextThreshold)
    trace.file(3,"   IsTextFile({0}): {1} ==> {2}".format(
            isBelowThreshold, os.path.basename(fileObject.name), fileBytes))
    return isBelowThreshold
Example #20
    def walk(self, pathToMeasure):
        '''
        Walk folders while filtering, sending updates via callback.
        We may be asked to terminate in our callback
        '''
        self._configStack.set_measure_root(pathToMeasure)

        for folderName, childFolders, fileNames in os.walk(pathToMeasure,
                                                           topdown=True):
            trace.file(2, "Scanning: {0}".format(folderName))

            numUnfilteredFiles = len(fileNames)
            filesAndConfigs = []

            if fileNames and self._valid_folder(folderName):

                # Get the current set of active config filters
                fileFilters, activeConfigs, configPath = self._configStack.get_configuration(
                    folderName)

                # Filter out files by options and config items
                filesToProcess = self._get_files_to_process(
                    folderName, fileNames, fileFilters, configPath)

                # Create list of tuples with fileName and configEntrys for each file
                for fileName, fileFilter in filesToProcess:
                    configEntrys = self._get_configs_for_file(
                        fileName, fileFilter, activeConfigs, configPath)
                    filesAndConfigs.append((fileName, configEntrys))

            # For delta measure create a fully qualified delta path name
            # Note when we split on the path to measure, it will start with a separator
            deltaFolder = None
            if self._deltaPath is not None:
                deltaFolder = self._deltaPath + folderName[len(pathToMeasure):]

            # Call back to job with files and configs
            continueProcessing = self._add_files_to_job(
                folderName, deltaFolder, filesAndConfigs, numUnfilteredFiles)

            if not continueProcessing or not self._expandSubdirs:
                break

            # Remove any folders, and sort remaining to ensure consistent walk
            # order across file systems (for our testing if nothing else)
            self._remove_skip_dirs(folderName, childFolders)
            childFolders.sort()
Example #21
    def walk(self, pathToMeasure):
        '''
        Walk folders while filtering, sending updates via callback.
        We may be asked to terminate in our callback
        '''
        self._configStack.set_measure_root(pathToMeasure)

        for folderName, childFolders, fileNames in os.walk(pathToMeasure, topdown=True):
            trace.file(2, "Scanning: {0}".format(folderName))

            numUnfilteredFiles = len(fileNames)
            filesAndConfigs = []

            if fileNames and self._valid_folder(folderName):

                # Get the current set of active config filters
                fileFilters, activeConfigs, configPath = self._configStack.get_configuration(folderName)

                # Filter out files by options and config items
                filesToProcess = self._get_files_to_process(folderName, fileNames, fileFilters, configPath)

                # Create list of tuples with fileName and configEntrys for each file
                for fileName, fileFilter in filesToProcess:
                    configEntrys = self._get_configs_for_file(fileName, fileFilter, activeConfigs, configPath)
                    filesAndConfigs.append((fileName, configEntrys))

            # For delta measure create a fully qualified delta path name
            # Note when we split on the path to measure, it will start with a separator
            deltaFolder = None
            if self._deltaPath is not None:
                deltaFolder = self._deltaPath + folderName[len(pathToMeasure):]

            # Call back to job with files and configs
            continueProcessing = self._add_files_to_job(
                        folderName,
                        deltaFolder,
                        filesAndConfigs,
                        numUnfilteredFiles)

            if not continueProcessing or not self._expandSubdirs:
                break

            # Remove any folders, and sort remaining to ensure consistent walk
            # order across file systems (for our testing if nothing else)
            self._remove_skip_dirs(folderName, childFolders)
            childFolders.sort()
Example #22
    def _measure_file(self, workItem):
        '''
        Unpack workItem and run all measures requested by the configItems
        for the file
        '''
        (   path,
            deltaPath,
            fileName,
            configItems,
            options,
            numFilesInFolder
            ) = workItem

        self._currentFilePath = os.path.join(path, fileName)
        trace.file(3, "Processing: {0}".format(self._currentFilePath))

        deltaFilePath = None
        if deltaPath is not None:
            deltaFilePath = os.path.join(deltaPath, fileName)

        continueProcessing = True
        try:
            for configItem in configItems:
                if self._check_for_stop():
                    break

                self._open_file(configItem.module, deltaFilePath)

                #
                # Synchronous delegation to the measure module defined in the config file
                #
                configItem.module.process_file(
                        self._currentFilePath,
                        self._currentFileIterator,
                        configItem,
                        numFilesInFolder,
                        self.file_measured_callback)

        except utils.FileMeasureError, e:
            trace.traceback(2)
            self._currentFileErrors.append(
                    uistrings.STR_ErrorMeasuringFile.format(self._currentFilePath, str(e)))
            continueProcessing = not options.breakOnError
Example #23
    def _stash_aggregates(self, filePath, analysisResults):
        '''
        As we receive results for files, if we have requests to aggregate
        results, store away aggregate information.
        The aggregate functionality is based on names of items generated
        by specific csmodules; we consider it a fatal error if what is
        requested for aggregation and what is present in analysisResults
        are out of sync
        '''
        # For each set of aggregates we go through results and add
        # them to the appropriate aggregate set
        for aggKey, aggNames in self._aggregateNames.iteritems():
            aggregateDict = self._aggregates.setdefault(aggKey, {})
            trace.file(2, "Aggregating {0} items in {1}".format(len(analysisResults), aggKey))
            for result in analysisResults:
                # aggKey has the name of the value from results that we
                # will be keying the aggregate dictionary on
                try:
                    newKey = result[aggKey]
                except KeyError, e:
                    raise utils.InputException(STR_AggregateKeyError.format(str(e)))
                else:
                    aggregate = aggregateDict.setdefault(newKey, {'aggregate.count':0})

                    # Specific names can be provided to aggregate, or 'all' aggregates everything
                    namesToAggregate = aggNames
                    if isinstance(aggNames, basestring):
                        if aggNames == 'all':
                            namesToAggregate = result.keys()

                    # Take each value from the result and aggregate according to type
                    for itemName in namesToAggregate:
                        self._aggregate_update(itemName, result[itemName], aggregate)

                    # Count the item
                    aggregate['aggregate.count'] += 1

                    # Update the aggregate
                    aggregateDict[newKey] = aggregate

            # The dictionary for this aggKey has been updated, so stash it
            self._aggregates[aggKey] = aggregateDict
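
The setdefault calls keep one running bucket per key. Stripped to its core,
with hypothetical measure names:

aggregates = {}
results = [
    {'dir': 'src',  'nbnc': 120},
    {'dir': 'src',  'nbnc': 40},
    {'dir': 'test', 'nbnc': 10},
]

for result in results:
    # Create the bucket on first sight, then accumulate into it
    aggregate = aggregates.setdefault(result['dir'],
                                      {'aggregate.count': 0, 'nbnc': 0})
    aggregate['nbnc'] += result['nbnc']
    aggregate['aggregate.count'] += 1

print(aggregates['src'])   # {'aggregate.count': 2, 'nbnc': 160}
print(aggregates['test'])  # {'aggregate.count': 1, 'nbnc': 10}
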
Example #24
    def _get_delta_lines(self, filePath, deltaFilePath):
        '''
        Return a line buffer that represents additional lines relative to the
        delta path. We are not doing a full diff, only taking into account new
        files, and lines in existing files that are new/modified.
        '''
        self._deltaFilePath = deltaFilePath
        deltaLines = None
        # If no corresponding file exists in the delta, we do a normal file open
        if not os.path.exists(deltaFilePath):
            trace.file(1, "Delta file doesn't exist for: {0}".format(deltaFilePath))
            deltaLines = self._open_file(filePath)

        # We only do a diff if there is an identical file name that has been modified
        elif not filecmp.cmp(deltaFilePath, filePath):
            fileToMeasure = self._open_file(filePath)
            if fileToMeasure is not None:
                measureFileLines = fileToMeasure.readlines()
                fileToMeasure.close()
                deltaFileLines = None
                with open(deltaFilePath, 'rU') as deltaFile:
                    deltaFileLines = deltaFile.readlines()
                # ndiff marks every line with a two-character prefix that we
                # strip below; '?' hint lines match neither prefix test
                diffLines = list(difflib.ndiff(deltaFileLines, measureFileLines))
                if diffLines:
                    deltaLines = []
                    for line in diffLines:
                        if line.startswith('+') or (self._deltaIncludeDeleted and line.startswith('-')):
                            deltaLines.append(line[2:])
            trace.file(1, "{0} delta lines with: {1}".format(
                    len(deltaLines) if deltaLines is not None else 0, deltaFilePath))
        else:
            trace.file(1, "Delta skip: {0} == {1}".format(filePath, deltaFilePath))
        return deltaLines
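
For reference, difflib.ndiff tags every line with a two-character prefix,
which is what the line[2:] slice above strips:

import difflib

old = ['keep\n', 'old line\n']
new = ['keep\n', 'something new\n']

# Prefixes: '  ' unchanged, '- ' removed, '+ ' added, '? ' intraline hints
for line in difflib.ndiff(old, new):
    print(repr(line))
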
Example #25
    def _is_file_survey_dupe(self, filePath, measures):
        '''
        Simple mechanism to identify duplicate and near-duplicate code by
        tracking a dictionary of the files we see as we measure. There are two modes:

        1) File Size: Build a dictionary in memory based on a hash of fileName
        and config info. In the hash buckets we store a dict of file sizes for
        the first of each size we see that is not within the dupe threshold.
        If we see a file size within the threshold of one of our existing
        hashed sizes, we treat it as a dupe and increment count for reporting.

        2) NBNC CRC: We use the nbnc.crc measure to identify duplicates

        Note that we ASSUME the necessary file metadata will be present in the
        measures dictionary, as basemodule.py puts it there for the Dupe option.
        '''
        firstDupeFilePath = None

        # 1) File name and Size check
        if isinstance(self._dupeThreshold, int):
            fileSize = int(measures[basemodule.METADATA_FILESIZE])
            dupeKey = (measures[basemodule.METADATA_FULLNAME] +
                        measures[basemodule.METADATA_CONFIG].replace(' ', ''))
            if dupeKey in self._dupeFileSurveys:
                for dupeFileSize, (fileCount, firstFilePath) in self._dupeFileSurveys[dupeKey].iteritems():
                    if (dupeFileSize - self._dupeThreshold) <= fileSize and (
                            fileSize <= (dupeFileSize + self._dupeThreshold)):
                        firstDupeFilePath = firstFilePath
                        self._dupeFileSurveys[dupeKey][dupeFileSize] = (fileCount + 1, firstFilePath)
                        trace.msg(1, "Dupe {0} by {1} of {2} bytes: {3}".format(
                                    fileCount, fileSize - dupeFileSize, fileSize, filePath))
                        break
            else:
                self._dupeFileSurveys[dupeKey] = {}

            if firstDupeFilePath is None:
                self._dupeFileSurveys[dupeKey][fileSize] = (1, filePath)
                trace.file(2, "Added {0} -- {1} to dupe dictionary".format(dupeKey, fileSize))

        # 2) Code CRC check
        # Relying on nbnc.crc is brittle, because it creates both a code and a
        # runtime dependency on the Code csmodule being used. And there are
        # valid scenarios where nbnc.crc may not be present (e.g., skipping a
        # dupe file), so if the measure isn't present we fail silently
        else:
            fileCrc = None
            try:
                fileCrc = measures['nbnc.crc']
            except KeyError:
                trace.file(2, "CRC Dupe - nbnc.crc missing: {0}".format(filePath))
            if fileCrc in self._dupeFileSurveys:
                fileCount, firstDupeFilePath = self._dupeFileSurveys[fileCrc]
                self._dupeFileSurveys[fileCrc] = (fileCount + 1, firstDupeFilePath)
                trace.msg(1, "Dupe {0}: {1} DUPE_OF {2}".format(fileCount, filePath, firstDupeFilePath))
            elif fileCrc is not None:
                self._dupeFileSurveys[fileCrc] = (1, filePath)
                trace.file(2, "Added {0} -- {1} to dupe dictionary".format(filePath, fileCrc))

        return firstDupeFilePath
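
The CRC mode reduces to keying a dict on a content checksum. A standalone
sketch with zlib.crc32 standing in for the nbnc.crc measure (which, per the
comment above, is produced by the Code csmodule in the real pipeline):

import zlib

dupeFileSurveys = {}

def check_dupe(filePath, content):
    # First sighting registers the file; later sightings report the original
    fileCrc = zlib.crc32(content.encode('utf-8'))
    if fileCrc in dupeFileSurveys:
        fileCount, firstDupeFilePath = dupeFileSurveys[fileCrc]
        dupeFileSurveys[fileCrc] = (fileCount + 1, firstDupeFilePath)
        return firstDupeFilePath
    dupeFileSurveys[fileCrc] = (1, filePath)
    return None

print(check_dupe('a.py', 'x = 1\n'))  # None
print(check_dupe('b.py', 'x = 1\n'))  # a.py
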
Example #26
    def _measure_file(self, workItem):
        '''
        Unpack workItem and run all measures requested by the configItems
        for the file
        '''
        (path, deltaPath, fileName, configItems, options,
         numFilesInFolder) = workItem

        self._currentFilePath = os.path.join(path, fileName)
        trace.file(1, "Processing: {0}".format(self._currentFilePath))

        deltaFilePath = None
        if deltaPath is not None:
            deltaFilePath = os.path.join(deltaPath, fileName)

        continueProcessing = True
        try:
            for configItem in configItems:
                if self._check_for_stop():
                    break

                self._open_file(configItem.module, deltaFilePath)

                #
                # Synchronous delegation to the measure module defined in the config file
                #
                configItem.module.process_file(self._currentFilePath,
                                               self._currentFileIterator,
                                               configItem, numFilesInFolder,
                                               self.file_measured_callback)

        except utils.FileMeasureError, e:
            trace.traceback(2)
            self._currentFileErrors.append(
                uistrings.STR_ErrorMeasuringFile.format(
                    self._currentFilePath, str(e)))
            continueProcessing = not options.breakOnError
Example #27
    def _open_file(self, filePath, oldFileHandle=None):
        '''
        Return the requested fileObject if criteria are met
        '''
        tryToOpen = True
        newFileHandle = None

        # Check for extensions
        if self._ignoreNonCode and (filetype.is_noncode_ext(filePath)):
            trace.file(1, "Skipping, non-code ext: {0}".format(filePath))
            tryToOpen = False
        # Check for size threshold
        elif self._sizeThreshold > 0:
            fileSize = utils.get_file_size(filePath)
            if self._sizeThreshold < fileSize:
                trace.file(1, "Skipping, size {0}: {1}".format(fileSize, filePath))
                tryToOpen = False

        if tryToOpen:
            # Open the file if it hasn't been opened, otherwise reset it
            if not oldFileHandle:
                # Use a universal open with line buffering to support binary files and
                # reduce the cost of open on larger files
                newFileHandle = open(filePath, 'rU', 1)
            else:
                newFileHandle = oldFileHandle
                newFileHandle.seek(0)    # Reset the file

            # Do tests that look at start of the file
            keepFileOpen = False
            if self._ignoreNonCode and filetype.is_noncode_file(newFileHandle):
                trace.file(1, "Skipping, non-code start: {0}".format(filePath))
            elif self._ignoreBinary and not filetype.is_text_file(newFileHandle):
                trace.file(1, "Skipping, binary char: {0}".format(filePath))
            else:
                keepFileOpen = True
            if not keepFileOpen:
                # If we were NOT passed an existing file handle, close what we opened
                if not oldFileHandle and newFileHandle:
                    newFileHandle.close()
                newFileHandle = None

        return newFileHandle
Example #28
    def process_file(self, filePath, fileLines,
                        configEntry,
                        numSameFiles,
                        file_measured_callback):
        '''
        Inherited modules use the default implementation of process_file
        to handle calling _survey and packaging results, including any
        file metadata
        '''
        utils.timing_set('FILE_MEASURE_TIME')
        trace.file(2, "process_file: {0} {1}".format(self.__class__.__name__, filePath))
        trace.file(3, "  config: {0}".format(str(configEntry)))

        # Stash path for error handling in derived classes
        self._currentPath = utils.SurveyorPathParser(filePath)

        # Does the config measure filter need to be overridden?
        if self._measureFilter is not None:
            configEntry.new_measure_filter(self._measureFilter)

        # Measurements (whole file metrics) will be stored in a dictionary
        # Pack measurement data with file metadata
        measurements = {}
        self._pack_metadata_into_measures(configEntry, numSameFiles, measurements)

        # Analysis items (per line items) are a list of dictionaries
        analysis = []

        #
        # Delegate the survey work to specializations
        #
        measureResults = {}
        analysisResults = []
        if self._survey(fileLines, configEntry, measurements, analysis):

            # Pack measurements that match our measure filter
            for measureName, measure in measurements.iteritems():
                if self.match_measure(measureName, configEntry.measureFilters):
                    measureResults[measureName] = measure

            # Pack analysis items into a list of dictionaries for return to app
            # We only send analysis items that match filter
            for analysisItem in analysis:
                analysisRow = {}
                for itemName, itemValue in analysisItem.iteritems():
                    if self.match_measure(itemName, configEntry.measureFilters):
                        analysisRow[itemName] = itemValue
                if analysisRow:
                    analysisResults.append(analysisRow)

            # If this is a delta comparison and there are no lines, it means the
            # delta file is an exact dupe
            if not fileLines and self._deltaFilePath:
                measureResults[METADATA_DUPE_PATH] = self._deltaFilePath

            # Add timing info
            if self.match_measure(METADATA_TIMING, configEntry.measureFilters):
                measureResults[METADATA_TIMING] = "{0:.4f}".format(utils.timing_get('FILE_MEASURE_TIME'))

        self._currentPath = None
        self._deltaFilePath = None

        # Send data back to the caller (jobworker.Worker in default framework)
        file_measured_callback(filePath, measureResults, analysisResults)
Example #29
    def _survey_lines(self, linesToSurvey, params, measurements, analysis):
        '''
        Analyze file line by line. linesToSurvey is an iterable set of lines.
        Processing is driven by the regular expressions in member variables.
        The order of processing each line is:
             - Preprocess line string
             - Detect machine vs. human code
             - Detect blank lines
             - Detect single and multi-line comments
             - Capture line measures
             - Perform line processing (searches, routines, etc.)
        '''
        # Setup dictionary for measures and searches we'll do
        self._survey_start(params)

        # If no lines to process, we may still want to output empty measures
        if linesToSurvey is None:
            linesToSurvey = []

        # Track whether we are inside a multi-line comment - we ignore nesting
        scanningMultiLine = False

        # Loop through the raw lines we were passed
        for bufferLine in linesToSurvey:
            self.counts['RawLines'][self._activeBlock] += 1
            if self._traceLevel: trace.file(4, "Raw: {0}".format(bufferLine))

            # Allow specializations to special-case certain lines
            if self._alternate_line_processing(bufferLine):
                continue

            # If we have a line separator, apply it
            lines = [bufferLine]
            if self.addLineSep is not None:
                lines = bufferLine.split(self.addLineSep)

            #
            # Read through the file lines and process them one at a time
            # This is the main processing loop for all csmodules derived from NBNC
            #
            try:
                for rawLine in lines:
                    self.counts['TotalLines'][self._activeBlock] += 1

                    # Allow for clean up of artifacts or other pre-processing
                    line = self._preprocess_line(rawLine)

                    # Detect true blank lines
                    if self.reTrueBlankLine.match(line):
                        self.counts['TrueBlankLines'][self._activeBlock] += 1
                        self._trace_line(line, "T")
                        continue

                    # Block Detection
                    if len(self.blockDetectors) > 1:
                        if self._detect_block_change(line, analysis):
                            scanningMultiLine = False  # Don't allow multi-line comment to span blocks

                    # Determine comment state
                    # This is done before blank lines to make sure we consider multi-line
                    # comment syntax that will be counted as "blank", e.g., /* on its own line
                    onCommentLine, scanningMultiLine = self._detect_line_comment(
                        line, scanningMultiLine)

                    # Detect blank lines
                    if self._detect_blank_line(line):
                        continue

                    # Measure and analyze -- overridden in derived classes
                    self._measure_line(line, onCommentLine)
                    self._analyze_line(line, analysis, onCommentLine)

            except Exception, e:
                trace.traceback()
                raise utils.FileMeasureError(
                    "Problem processing line: {0} with module: {1}\n{2}".
                    format(str(sum(self.counts['RawLines'])),
                           self.__class__.__name__, str(e)))
Example #30
    def _survey_lines(self, linesToSurvey, params, measurements, analysis):
        '''
        Analyze file line by line. linesToSurvey is an iterable set of lines.
        Processing is driven by the regular expressions in member variables.
        The order of processing each line is:
             - Preprocess line string
             - Detect machine vs. human code
             - Detect blank lines
             - Detect single and multi-line comments
             - Capture line measures
             - Perform line processing (searches, routines, etc.)
        '''
        # Setup dictionary for measures and searches we'll do
        self._survey_start(params)

        # If no lines to process, we may still want to output empty measures
        if linesToSurvey is None:
            linesToSurvey = []

        # Track whether we are inside a multi-line comment - we ignore nesting
        scanningMultiLine = False

        # Loop through the raw lines we were passed
        for bufferLine in linesToSurvey:
            self.counts['RawLines'][self._activeBlock] += 1
            if self._traceLevel: trace.file(4, "Raw: {0}".format(bufferLine))

            # Allow specializations to skip and/or special-case certain lines
            if self._alternate_line_processing(bufferLine):
                continue

            # If we have a line separator, apply it
            lines = [bufferLine]
            if self.addLineSep is not None:
                lines = bufferLine.split(self.addLineSep)

            #
            # Read through the lines to measure and process them one at a time
            # This is the main measure loop for csmodules derived from NBNC
            #
            try:
                for rawLine in lines:
                    self.counts['TotalLines'][self._activeBlock] += 1

                    # Allow for clean up of artifacts or other pre-processing
                    line = self._preprocess_line(rawLine)

                    # Detect true blank lines
                    if self.reTrueBlankLine.match(line):
                        self.counts['TrueBlankLines'][self._activeBlock] += 1
                        self._trace_line(line, "T")
                        continue

                    # Block Detection
                    if len(self.blockDetectors) > 1:
                        if self._detect_block_change(line, analysis):
                            scanningMultiLine = False  # Don't allow multi-line comment to span blocks

                    # Determine comment state
                    # This is done before blank lines to make sure we consider multi-line
                    # comment syntax that will be counted as "blank", e.g., /* on its own line
                    onCommentLine, scanningMultiLine = self._detect_line_comment(line, scanningMultiLine)

                    # Detect "blank" lines with no useful info
                    if self._detect_blank_line(line):
                        continue

                    # Measure and analyze -- overridden in derived classes
                    self._measure_line(line, onCommentLine)
                    self._analyze_line(line, analysis, onCommentLine)

            except Exception, e:
                trace.traceback()
                raise utils.FileMeasureError(
                        "Problem processing line: {0} with module: {1}\n{2}".format(
                        str(sum(self.counts['RawLines'])), self.__class__.__name__, str(e)))
Example #31
def is_noncode_ext(filePath):
    rv = False
    if is_compressed_ext(filePath) or _has_ext(filePath, NonCodeFileExtensions):
        rv = True
        trace.file(3, "   NonCodeExt:  {0}".format(os.path.basename(filePath)))
    return rv