def check(self):
    """Run the configured matchers over the staged content.

    Returns a Result describing the outcome: the stored upstream
    error, a FILETYPE_NOT_ALLOWED verdict for disallowed extensions,
    or MATCH/NOT_MATCH with any pattern matches and annotation
    comments attached.
    """
    # Identifier combines the filename with the target commit hash.
    identifier = '%s (%s)' % (self.filename, self.commit_hashes[1])

    # Annotations gathered while checking; currently only used to
    # note when base64 content was stripped before matching.
    comments = []

    # Forward a previously recorded error (e.g. too many additions,
    # likely a big data file) unchanged; note this Result is keyed by
    # the raw filename rather than the identifier.
    if self.error:
        return Result(self.filename, self.error)

    # Only files with an allow-listed extension are inspected further.
    if filetype.get_extension(self.filename) not in self.allowed_extensions:
        return Result(identifier, FILETYPE_NOT_ALLOWED)

    # Strip base64 blobs first so they cannot trigger false positives.
    has_base64, self.content = m.base64_matcher(self.content, remove=True)
    if has_base64:
        comments.append('BASE64_REMOVED')

    # Run password, IP-address and amazonaws.com domain matchers in
    # a single pass over the cleaned content.
    amazonaws_matcher = m.create_domain_matcher('amazonaws.com')
    match, matches = m.multi_matcher(self.content, m.password_matcher,
                                     m.ip_matcher, amazonaws_matcher)

    if match:
        return Result(identifier, MATCH, matches=matches, comments=comments)
    return Result(identifier, NOT_MATCH, comments=comments)
def check(self):
    """Check a file's diff additions between two commits for passwords.

    Returns a Result carrying either the stored upstream error, a
    FILETYPE_NOT_ALLOWED verdict, or MATCH/NOT_MATCH from the password
    matcher, with annotation comments attached.
    """
    # Identifier names the file and the commit range being checked.
    identifier = '%s from commit %s to commit %s' % (
        self.filename, self.commit_hashes[0], self.commit_hashes[1])

    # Annotations gathered while checking; currently only used to note
    # when base64 content was stripped before matching.
    # (Fixed: local was previously misspelled `commments`.)
    comments = []

    # Git detects binary files in diffs and emits only a marker line
    # (e.g. "Binary files /dev/null and b/img.JPG differ"), so no
    # content arrives for them. Oversized additions are surfaced
    # upstream as self.error; just forward it, keyed by the raw
    # filename rather than the identifier.
    if self.error:
        return Result(self.filename, self.error)

    # Only files with an allow-listed extension are inspected further.
    if filetype.get_extension(self.filename) not in self.allowed_extensions:
        return Result(identifier, FILETYPE_NOT_ALLOWED)

    # Strip base64 blobs first so they cannot trigger false positives.
    has_base64, self.content = matchers.base64_matcher(self.content,
                                                       remove=True)
    if has_base64:
        comments.append('BASE64_REMOVED')

    # Apply the password matcher to the cleaned content.
    has_pwd, matches = matchers.password_matcher(self.content)
    if has_pwd:
        return Result(identifier, MATCH, matches=matches, comments=comments)
    return Result(identifier, NOT_MATCH, comments=comments)
def check(self):
    """Apply all matchers to the staged content and report the outcome.

    Produces a Result: the stored error if one exists, a
    FILETYPE_NOT_ALLOWED verdict for extensions outside the allow-list,
    or a MATCH/NOT_MATCH verdict with matches and comments.
    """
    # Human-readable identifier: filename plus the commit it came from.
    ident = '%s (%s)' % (self.filename, self.commit_hashes[1])
    notes = []  # annotations collected while checking (base64 removal)

    # Bail out early on a previously recorded error (e.g. too many
    # additions from a big data file); this Result uses the raw
    # filename, not the identifier.
    if self.error:
        return Result(self.filename, self.error)

    # Only allow-listed extensions are scanned.
    extension = filetype.get_extension(self.filename)
    if extension not in self.allowed_extensions:
        return Result(ident, FILETYPE_NOT_ALLOWED)

    # Remove base64 payloads up front and record that we did so, so
    # encoded blobs cannot produce false positives below.
    found_b64, self.content = m.base64_matcher(self.content, remove=True)
    if found_b64:
        notes.append('BASE64_REMOVED')

    # Password, IP-address and amazonaws.com matchers in one pass.
    aws = m.create_domain_matcher('amazonaws.com')
    hit, hits = m.multi_matcher(self.content, m.password_matcher,
                                m.ip_matcher, aws)

    if hit:
        return Result(ident, MATCH, matches=hits, comments=notes)
    return Result(ident, NOT_MATCH, comments=notes)
def check(self):
    """Scan a file on disk with every configured matcher.

    Skips oversized files and disallowed extensions with a warning
    Result; otherwise reads the file, strips base64 payloads, and
    returns MATCH/NOT_MATCH with matches and annotation comments.
    """
    notes = []  # annotations collected during the scan (base64 removal)

    # Refuse to open files above max_file_size_bytes (default 1MB):
    # pattern matching over huge files would be prohibitively slow,
    # so just emit a warning Result instead.
    size = os.stat(self.path).st_size
    if size > self.max_file_size_bytes:
        return Result(self.path, BIG_FILE)

    # Only allow-listed extensions are scanned.
    if filetype.get_extension(self.path) not in self.allowed_extensions:
        return Result(self.path, FILETYPE_NOT_ALLOWED)

    # At this point the file has an allowed extension and is under the
    # size cap, so load it whole and apply every rule.
    with open(self.path, 'r') as handle:
        data = handle.read()

    # Strip base64 payloads first so encoded blobs cannot produce
    # false positives; record the removal as a warning annotation.
    found_b64, data = m.base64_matcher(data, remove=True)
    if found_b64:
        notes.append('BASE64_REMOVED')

    # Run password, IP-address and amazonaws.com matchers in one pass.
    aws = m.create_domain_matcher('amazonaws.com')
    hit, hits = m.multi_matcher(data, m.password_matcher,
                                m.ip_matcher, aws)

    if hit:
        return Result(self.path, MATCH, matches=hits, comments=notes)
    return Result(self.path, NOT_MATCH, comments=notes)
def check(self): #Check file size if it's more than 1MB #send just a warning and do not open the file, #since pattern matching is going to be really slow f_size = os.stat(self.path).st_size if f_size > 1048576L: return Result(self.path, BIG_FILE) #Then, filter all non-plain text files #also send a warning for those, if they are non-plain text #and less than 1MB they are probably xlsx, pdfs, pngs, zips, ppt, pptx if not self.mimetype.startswith('text/'): #Add checks for certain files? (word, excel, powerpoint...) return Result(self.path, NOT_PLAIN_TEXT) #Now, filter all files which mimetype could not be determined #At this point you only have plain text files, smaller than 1MB #open the file and then apply all rules with open(self.path, 'r') as f: content = f.read() #Last check: search for potential base64 strings and remove them, send a warning has_base64, content = matchers.base64_matcher(content, remove=True) if has_base64: print 'Removing base64 code...' #Maybe send warnings for data files (even if they are less than 1MB)? #First matcher: passwords password_matcher, matches = matchers.password_matcher(content) if password_matcher: return Result(self.path, MATCH, matches) else: return Result(self.path, NOT_MATCH)