def is_text_file(fileObject):
    ''' Heuristically decide whether fileObject contains text.

    Samples the start of the file (with nulls stripped, so unicode text
    files still register as text) and checks the fraction of non-text
    bytes against a tolerance threshold.  PDF files are special-cased
    because their content can look text-like but is not.
    '''
    # The byte values we accept as "text"
    textChars = string.letters + string.digits + string.punctuation + string.whitespace

    bytesToCheck = 128      # Big enough window to grab, but small for speed
    startPoint = 4          # Skip start of file, for hidden text codes
    minWindowSize = 32      # Get a big enough min window to be feasible
    nonTextThreshold = 0.2  # Have some tolerance to avoid false positives

    # Grab the first bytes of the file, STRIPPING NULLS (for unicode text files)
    sampleBytes = utils.strip_null_chars(
        utils.get_file_start(fileObject, bytesToCheck))

    if is_pdf_file(fileObject):
        # Special case for PDF that looks like text but isn't
        isBelowThreshold = False
        trace.file(
            3, " IsTextFile({0}): {1} ==> PDF File detected".format(
                isBelowThreshold, os.path.basename(fileObject.name)))
    else:
        isBelowThreshold = utils.check_bytes_below_threshold(
            sampleBytes, textChars, minWindowSize, startPoint, nonTextThreshold)
        trace.file(
            3, " IsTextFile({0}): {1} ==> {2}".format(
                isBelowThreshold, os.path.basename(fileObject.name), sampleBytes))

    return isBelowThreshold
def _preprocess_line(self, line):
    ''' Trim the line to the maximum allowed length and drop null
        characters, which can occur with multibyte file formats.
        Can be overridden if multibyte content needs to be preserved.
    '''
    truncated = line[:self.maxLineLength]
    return utils.strip_null_chars(truncated)
def _measure_text(self, fileObject, measurements):
    ''' Default handler
        For text based files, walk the file line by line, counting
        total, blank, and content lines.
    '''
    for raw in fileObject:
        self.totalLines += 1
        cleaned = utils.strip_null_chars(raw)
        if self.reBlankLine.match(cleaned):
            # Blank line
            self.blankLines += 1
        else:
            # Content line
            self.contentLines += 1
def _alternate_line_processing(self, line):
    ''' Check the start of the line to detect a binary part of a PBL file.
        Some of this processing is really only for PBL files, but it is
        harmless for exported files.
        Returns whether any further surveyor processing of the line
        should be skipped.
    '''
    # Let the parent class have first crack at the line
    if super(customPowerBuilder, self)._alternate_line_processing(line):
        return True

    line = utils.strip_null_chars(line)
    if self.reTrueBlankLine.match(line):
        return False

    lineIsText = True
    if self._isPblFile:
        line = self._clean_PB_tokens(line)
        # Heuristic text check: sample the start of the line and measure
        # how many bytes fall outside the expected PB character set
        windowSize = 80
        pbTextChars = (string.letters + string.digits + string.whitespace +
                       '~$\\/-_<>=():*|;,\"')
        startPoint = 1
        minWindowSize = 15
        threshold = 0.2
        lineIsText = utils.check_bytes_below_threshold(
            line.lstrip()[:windowSize], pbTextChars,
            minWindowSize, startPoint, threshold)
        line = utils.strip_extended_chars(line)

    if not lineIsText:
        # Binary chunk of the PBL file
        self.counts['PbBinLines'][self._activeBlock] += 1
        self._write_out_line('Bin', line)
        return True

    if self._is_generated_line(line):
        # Machine-generated line; tally it and stop processing
        self.counts['PbGenLines'][self._activeBlock] += 1
        self._write_out_line('Gen', line)
        return True

    # At this point, we have a line of code we want to process;
    # separate out routines and tables from generic code
    if not self._process_PB_routine(line):
        if self._in_PB_table(line):
            self._write_out_line('Tables', line)
        else:
            self._write_out_line('Code', line)
    return False
def is_text_file(fileObject):
    ''' Heuristically decide whether fileObject contains text by sampling
        its first bytes (nulls stripped, so unicode text files still read
        as text) and checking the fraction of non-text bytes against a
        tolerance threshold.
    '''
    # The byte values we accept as "text"
    textChars = string.letters + string.digits + string.punctuation + string.whitespace

    bytesToCheck = 128      # Big enough window to grab, but small for speed
    startPoint = 4          # Skip start of file, for hidden text codes
    minWindowSize = 32      # Get a big enough min window to be feasible
    nonTextThreshold = 0.2  # Have some tolerance to avoid false positives

    # Grab the first bytes of the file, STRIPPING NULLS (for unicode text files)
    sampleBytes = utils.strip_null_chars(
        utils.get_file_start(fileObject, bytesToCheck))

    result = utils.check_bytes_below_threshold(
        sampleBytes, textChars, minWindowSize, startPoint, nonTextThreshold)
    trace.file(3," IsTextFile({0}): {1} ==> {2}".format(
        result, os.path.basename(fileObject.name), sampleBytes))
    return result
def _measure_text(self, fileObject, measurements):
    ''' Default handler
        For text based files, walk the file line by line, counting
        total, blank, and content lines.
    '''
    if self._traceLevel:
        trace.file(4, "Document: {0}".format(fileObject))

    for raw in fileObject:
        self.totalLines += 1
        cleaned = utils.strip_null_chars(raw)
        if self.reBlankLine.match(cleaned):
            # Blank line
            self.blankLines += 1
        else:
            # Content line
            self.contentLines += 1
def _search(self, lines, configEntry, measurements, analysis):
    ''' Loop through the lines, comparing each against both the positive
        and negative list of search strings provided, and append an
        analysis item for every hit.
    '''
    positiveSearches, negativeSearches = self._setup_search_strings(
        configEntry.paramsProcessed)
    val_TotalHits = 0
    val_TotalLines = 0
    try:
        for rawLine in lines:
            val_TotalLines += 1
            line = utils.strip_null_chars(rawLine)

            matchTuple = self._first_match(
                line, positiveSearches, negativeSearches)
            if not matchTuple:
                continue

            origPatternStr, match = matchTuple
            val_TotalHits += 1

            # We may be searching binaries, so take some steps to clean up
            # the line string we export
            cleanedLine = line.strip()[:self.MAX_STR_LEN]
            cleanedLine = utils.safe_ascii_string(cleanedLine)
            cleanedLine = utils.strip_annoying_chars(cleanedLine)

            # Export the findings
            analysis.append({
                self.SEARCH_LINE: cleanedLine[:self.MAX_STR_LEN],
                self.SEARCH_LINENUM: val_TotalLines,
                self.SEARCH_CONFIG_RE: origPatternStr,
                self.SEARCH_REGEXP:
                    utils.get_match_pattern(match)[:self.MAX_STR_LEN],
                self.SEARCH_MATCH:
                    utils.get_match_string(match)[:self.MAX_STR_LEN],
            })
    except Exception as e:
        raise utils.CsModuleException(
            "Error {0}\n...searching line: {1}".format(
                str(e), str(val_TotalLines)))
def _alternate_line_processing(self, line):
    ''' Check the start of the line to detect a binary part of a PBL file.
        Some of this processing is really only for PBL files, but it is
        harmless for exported files.
        Returns whether any further surveyor processing of the line
        should be skipped.
    '''
    # Let the parent class have first crack at the line
    if super(customPowerBuilder, self)._alternate_line_processing(line):
        return True

    line = utils.strip_null_chars(line)
    if self.reTrueBlankLine.match(line):
        return False

    lineIsText = True
    if self._isPblFile:
        line = self._clean_PB_tokens(line)
        lineIsText = not utils.is_str_binary(line)
        line = utils.strip_extended_chars(line)

    if not lineIsText:
        # Binary chunk of the PBL file
        self.counts['PbBinLines'][self._activeBlock] += 1
        self._write_out_line('Bin', line)
        return True

    if self._is_generated_line(line):
        # Machine-generated line; tally it and stop processing
        self.counts['MeasureLines'][self.MACHINE] += 1
        self._write_out_line('Gen', line)
        return True

    # At this point, we have a line of code we want to process;
    # separate out routines and tables from generic code
    if not self._process_PB_routine(line):
        if self._in_PB_table(line):
            self._write_out_line('Tables', line)
        else:
            self._write_out_line('Code', line)
    return False
def _search_multi(self, lines, configEntry, measurements, analysis):
    ''' Run multi-line searches against the text of the whole file,
        appending one analysis item if any search matches.
    '''
    # Make sure lines represents the text of the file
    try:
        lines = lines.read()    # file-like object: pull out its text
    except AttributeError:
        pass                    # already a string

    lines = utils.strip_null_chars(lines)
    positiveSearches, negativeSearches = self._setup_search_strings(
        configEntry.paramsProcessed)

    matchTuple = self._first_match(lines, positiveSearches, negativeSearches)
    if not matchTuple:
        return

    origPatternStr, match = matchTuple
    analysis.append({
        self.SEARCH_CONFIG_RE: origPatternStr,
        self.SEARCH_REGEXP:
            utils.get_match_pattern(match)[:self.MAX_STR_LEN],
        self.SEARCH_MATCH: utils.get_match_string(match),
    })