示例#1
0
def is_text_file(fileObject):
    '''
    Decide whether a file looks like text by sampling the bytes at its start.

    A file reported as PDF by is_pdf_file is always treated as non-text.
    Any other file is judged by utils.check_bytes_below_threshold over the
    sampled bytes. The outcome is traced at level 3 and returned.
    '''
    sampleSize = 128      # Big enough window to grab, but small for speed
    skipCount = 4         # Skip start of file, for hidden text codes
    smallestWindow = 32   # Big enough min window to be feasible
    tolerance = 0.2       # Some tolerance to avoid false positives
    printable = (string.letters + string.digits +
                 string.punctuation + string.whitespace)

    # Sample the start of the file, stripping nulls (for unicode text files)
    head = utils.get_file_start(fileObject, sampleSize)
    sample = utils.strip_null_chars(head)

    if not is_pdf_file(fileObject):
        looksLikeText = utils.check_bytes_below_threshold(
            sample, printable, smallestWindow, skipCount, tolerance)
        message = "   IsTextFile({0}): {1} ==> {2}".format(
            looksLikeText, os.path.basename(fileObject.name), sample)
    else:
        # Special case: a PDF looks like text but isn't
        looksLikeText = False
        message = "   IsTextFile({0}): {1} ==> PDF File detected".format(
            looksLikeText, os.path.basename(fileObject.name))

    trace.file(3, message)
    return looksLikeText
示例#2
0
 def _preprocess_line(self, line):
     '''
     Prepare a raw line before it is processed.

     The line is cut down to self.maxLineLength and then passed through
     utils.strip_null_chars (null characters can occur with multibyte file
     formats). Can be overridden if multibyte needs to be preserved.
     '''
     truncated = line[:self.maxLineLength]
     return utils.strip_null_chars(truncated)
示例#3
0
文件: NBNC.py 项目: sjt1/codesurveyor
 def _preprocess_line(self, line):
     '''
     Return the line limited to the maximum allowed length, with null
     characters removed.

     Null characters can occur with multibyte file formats; override this
     method if multibyte content needs to be preserved.
     '''
     limit = self.maxLineLength
     shortened = line[:limit]
     return utils.strip_null_chars(shortened)
示例#4
0
    def _measure_text(self, fileObject, measurements):
        '''
        Default handler for text based files: walk the file line by line,
        counting every line in totalLines and classifying it as either a
        blank line or a content line.
        '''
        for rawLine in fileObject:
            self.totalLines += 1
            cleaned = utils.strip_null_chars(rawLine)

            if self.reBlankLine.match(cleaned):
                # Blank line
                self.blankLines += 1
            else:
                # Content line
                self.contentLines += 1
    def _alternate_line_processing(self, line):
        '''
        Check the start of the line to detect a binary part of a PBL file.
        Some of this processing is really only for PBL files, but it will be
        harmless for exported files.

        Returns True when no more surveyor processing should be done on the
        line (the parent class handled it, or it was written out as a
        generated or binary line), and False for blank lines and for code
        lines that should continue to be processed.
        '''
        if super(customPowerBuilder, self)._alternate_line_processing(line):
            return True

        line = utils.strip_null_chars(line)
        if self.reTrueBlankLine.match(line):
            return False

        looksLikeText = True
        if self._isPblFile:
            line = self._clean_PB_tokens(line)

            # Judge only a leading window of the left-stripped line
            sampleWidth = 80
            allowedChars = string.letters + string.digits + string.whitespace + '~$\\/-_<>=():*|;,\"'
            skipCount = 1
            smallestWindow = 15
            tolerance = 0.2
            sample = line.lstrip()[:sampleWidth]
            looksLikeText = utils.check_bytes_below_threshold(
                sample, allowedChars, smallestWindow, skipCount, tolerance)

        line = utils.strip_extended_chars(line)

        if not looksLikeText:
            self.counts['PbBinLines'][self._activeBlock] += 1
            self._write_out_line('Bin', line)
            return True

        if self._is_generated_line(line):
            self.counts['PbGenLines'][self._activeBlock] += 1
            self._write_out_line('Gen', line)
            return True

        # At this point, we have a line of code we want to process.
        # Routines and tables are separated out from generic code.
        if not self._process_PB_routine(line):
            category = 'Tables' if self._in_PB_table(line) else 'Code'
            self._write_out_line(category, line)
        return False
示例#6
0
def is_text_file(fileObject):
    '''
    Decide whether a file looks like text by sampling the bytes at its start.

    The sampled bytes are judged by utils.check_bytes_below_threshold; the
    outcome is traced at level 3 and returned.
    '''
    sampleSize = 128      # Big enough window to grab, but small for speed
    skipCount = 4         # Skip start of file, for hidden text codes
    smallestWindow = 32   # Big enough min window to be feasible
    tolerance = 0.2       # Some tolerance to avoid false positives
    printable = (string.letters + string.digits +
                 string.punctuation + string.whitespace)

    # Sample the start of the file, stripping nulls (for unicode text files)
    head = utils.get_file_start(fileObject, sampleSize)
    sample = utils.strip_null_chars(head)

    looksLikeText = utils.check_bytes_below_threshold(
        sample, printable, smallestWindow, skipCount, tolerance)

    baseName = os.path.basename(fileObject.name)
    trace.file(3, "   IsTextFile({0}): {1} ==> {2}".format(
        looksLikeText, baseName, sample))
    return looksLikeText
示例#7
0
    def _measure_text(self, fileObject, measurements):
        '''
        Default handler for text based files: walk the file line by line,
        counting every line in totalLines and classifying it as either a
        blank line or a content line. The file object is traced at level 4
        first when a trace level is set.
        '''
        if self._traceLevel:
            trace.file(4, "Document: {0}".format(fileObject))

        for rawLine in fileObject:
            self.totalLines += 1
            cleaned = utils.strip_null_chars(rawLine)

            if self.reBlankLine.match(cleaned):
                # Blank line
                self.blankLines += 1
            else:
                # Content line
                self.contentLines += 1
示例#8
0
    def _search(self, lines, configEntry, measurements, analysis):
        '''
        Loop through the lines, comparing each aginst
        both the positve and negative list of search strings provided.
        '''
        positiveSearches, negativeSearches = self._setup_search_strings(
            configEntry.paramsProcessed)

        val_TotalHits = 0
        val_TotalLines = 0
        try:
            for rawLine in lines:
                line = utils.strip_null_chars(rawLine)
                val_TotalLines += 1

                matchTuple = self._first_match(line, positiveSearches,
                                               negativeSearches)
                if matchTuple:
                    origPatternStr, match = matchTuple
                    val_TotalHits += 1

                    # We may be searching binaries, so take some steps to clean up
                    # the line string we export
                    cleanSearchLine = line.strip()
                    cleanSearchLine = cleanSearchLine[:self.MAX_STR_LEN]
                    cleanSearchLine = utils.safe_ascii_string(cleanSearchLine)
                    cleanSearchLine = utils.strip_annoying_chars(
                        cleanSearchLine)

                    # Export the findings
                    analysisItem = {}
                    analysisItem[
                        self.SEARCH_LINE] = cleanSearchLine[:self.MAX_STR_LEN]
                    analysisItem[self.SEARCH_LINENUM] = val_TotalLines
                    analysisItem[self.SEARCH_CONFIG_RE] = origPatternStr
                    analysisItem[self.SEARCH_REGEXP] = utils.get_match_pattern(
                        match)[:self.MAX_STR_LEN]
                    analysisItem[self.SEARCH_MATCH] = utils.get_match_string(
                        match)[:self.MAX_STR_LEN]
                    analysis.append(analysisItem)

        except Exception, e:
            raise utils.CsModuleException(
                "Error {0}\n...searching line: {1}".format(
                    str(e), str(val_TotalLines)))
示例#9
0
    def _alternate_line_processing(self, line):
        '''
        Check the start of the line to detect a binary part of a PBL file.
        Some of this processing is really only for PBL files, but it will be
        harmless for exported files.

        Returns True when no more surveyor processing should be done on the
        line (the parent class handled it, or it was written out as a
        generated or binary line), and False for blank lines and for code
        lines that should continue to be processed.
        '''
        if super(customPowerBuilder, self)._alternate_line_processing(line):
            return True

        line = utils.strip_null_chars(line)
        if self.reTrueBlankLine.match(line):
            return False

        looksLikeText = True
        if self._isPblFile:
            line = self._clean_PB_tokens(line)
            looksLikeText = not utils.is_str_binary(line)

        line = utils.strip_extended_chars(line)

        if not looksLikeText:
            self.counts['PbBinLines'][self._activeBlock] += 1
            self._write_out_line('Bin', line)
            return True

        if self._is_generated_line(line):
            self.counts['MeasureLines'][self.MACHINE] += 1
            self._write_out_line('Gen', line)
            return True

        # At this point, we have a line of code we want to process.
        # Routines and tables are separated out from generic code.
        if not self._process_PB_routine(line):
            category = 'Tables' if self._in_PB_table(line) else 'Code'
            self._write_out_line(category, line)
        return False
示例#10
0
    def _search_multi(self, lines, configEntry, measurements, analysis):
        '''
        Use multi-line searches: the whole text is matched in one pass
        against the positive and negative search strings, and at most one
        result dictionary is appended to analysis.
        '''
        # Make sure we hold the text of the file: a file-like object is
        # replaced by its content, anything without read() is used as-is
        try:
            lines = lines.read()
        except AttributeError:
            pass
        text = utils.strip_null_chars(lines)

        positives, negatives = self._setup_search_strings(
            configEntry.paramsProcessed)
        found = self._first_match(text, positives, negatives)
        if not found:
            return

        patternText, matchObj = found
        analysis.append({
            self.SEARCH_CONFIG_RE: patternText,
            self.SEARCH_REGEXP:
                utils.get_match_pattern(matchObj)[:self.MAX_STR_LEN],
            self.SEARCH_MATCH: utils.get_match_string(matchObj),
        })