def check_regexs(self, regexs, search_extensions, enable_pdf):
    """Check this file against the regular expressions and return the matches.

    What is inspected depends on ``self.type``:

    * ``'ZIP'``: a ``.docx`` file first has its text extracted with
      ``docx2txt`` and checked; then, if ``self.path`` is a valid ZIP
      archive, it is handed to ``check_zip_regexs``, otherwise the error
      ``'Invalid ZIP file'`` is recorded.
    * ``'TEXT'``: the file content is loaded with ``read_file`` and checked.
    * ``'SPECIAL'``: ``.msg``, ``.pdf`` (only when ``enable_pdf`` is true)
      and ``.mdb`` files are passed to their dedicated checkers.

    Any other type is left untouched.  Failures while processing a file are
    reported through ``self.set_error`` rather than raised, so one bad file
    does not abort the caller.

    Args:
        regexs: passed through unchanged to the ``check_*_regexs`` helpers.
        search_extensions: passed through to the ZIP and MSG checkers.
        enable_pdf: when true, ``.pdf`` files are loaded with ``pdfquery``
            and checked; also forwarded to the ZIP and MSG checkers.

    Returns:
        ``self.matches``.
    """
    # NOTE(review): the .pdf/.mdb checks are assumed to belong to the
    # 'SPECIAL' branch, with .mdb not gated by enable_pdf -- confirm against
    # the intended layout.
    if self.type == 'ZIP':
        try:
            if get_ext(self.path) == '.docx':
                doctext = docx2txt.process(self.path)
                self.check_text_regexs(doctext, regexs, '')

            # A .docx is itself a ZIP container, so it also goes through the
            # archive check below.
            if zipfile.is_zipfile(self.path):
                zf = zipfile.ZipFile(self.path)
                try:
                    self.check_zip_regexs(
                        zf, regexs, search_extensions, enable_pdf, '')
                finally:
                    # Release the file handle even if the check fails.
                    zf.close()
            else:
                self.set_error('Invalid ZIP file')
        except Exception as exc:
            self.set_error(exc)

    elif self.type == 'TEXT':
        try:
            file_text = read_file(self.path, 'rb')
            self.check_text_regexs(file_text, regexs, '')
        # ``Exception`` covers IOError and (on Windows) WindowsError.  Naming
        # WindowsError explicitly raises NameError on other platforms.
        except Exception as exc:
            self.set_error(exc)

    elif self.type == 'SPECIAL':
        ext = get_ext(self.path)

        if ext == '.msg':
            try:
                msg = msmsg.MSMSG(self.path)
                try:
                    if msg.validMSG:
                        self.check_msg_regexs(
                            msg, regexs, search_extensions, enable_pdf, '')
                    else:
                        self.set_error('Invalid MSG file')
                finally:
                    # Close the message even when the check raises.
                    msg.close()
            except Exception as exc:
                self.set_error(exc)

        if enable_pdf and ext == '.pdf':
            try:
                pdf = pdfquery.PDFQuery(self.path)
                pdf.load()
                self.check_pdf_regexs(pdf, regexs, '')
            except Exception as exc:
                self.set_error(exc)

        if ext == '.mdb':
            try:
                self.check_access_regexs(self.path, 'mdb', regexs)
            except Exception as exc:
                self.set_error(exc)

    return self.matches
def check_zip_regexs(self, zf, regexs, search_extensions, enable_pdf,
                     sub_path):
    """Check the members of an open ZIP archive against the regexs.

    Members are selected by extension (``get_ext``) and handled by the
    first category they fall into:

    * ``search_extensions['ZIP']``: the member is loaded into memory,
      opened as a nested archive and checked by a recursive call.
    * ``search_extensions['TEXT']``: the member's raw content is passed to
      ``check_text_regexs``.
    * ``search_extensions['SPECIAL']``: only ``.msg`` members are handled;
      a valid message is passed to ``check_msg_regexs``.  Other special
      extensions are skipped here.

    Failures on an individual member (e.g. a password-protected archive)
    are reported through ``self.set_error`` and do not stop the scan of the
    remaining members.

    Args:
        zf: an open archive object providing ``namelist()`` and ``read()``.
        regexs: passed through to the text/MSG checkers.
        search_extensions: mapping with ``'TEXT'``, ``'ZIP'`` and
            ``'SPECIAL'`` extension lists.
        enable_pdf: only forwarded to nested-archive and MSG checks.
        sub_path: path prefix of this archive inside its parent; member
            names are joined onto it for reporting.
    """
    zip_exts = search_extensions['ZIP']
    text_exts = search_extensions['TEXT']
    all_extensions = text_exts + zip_exts + search_extensions['SPECIAL']

    # Work out each member's extension once instead of on every test.
    candidates = []
    for name in zf.namelist():
        ext = get_ext(name)
        if ext in all_extensions:
            candidates.append((name, ext))

    for name, ext in candidates:
        if ext in zip_exts:  # nested zip file
            try:
                memory_zip = cStringIO.StringIO()
                try:
                    # zf.read() closes the member handle, unlike
                    # zf.open(name).read().
                    memory_zip.write(zf.read(name))
                    nested_zf = zipfile.ZipFile(memory_zip)
                    self.check_zip_regexs(
                        nested_zf, regexs, search_extensions, enable_pdf,
                        os.path.join(sub_path, decode_zip_filename(name)))
                finally:
                    # Free the buffer even if the nested check fails.
                    memory_zip.close()
            except Exception as exc:  # e.g. zip needs password
                self.set_error(exc)

        elif ext in text_exts:  # normal doc
            try:
                file_text = zf.read(name)
                self.check_text_regexs(
                    file_text, regexs,
                    os.path.join(sub_path, decode_zip_filename(name)))
            except Exception as exc:  # e.g. zip needs password
                self.set_error(exc)

        elif ext == '.msg':  # SPECIAL
            try:
                memory_msg = cStringIO.StringIO()
                try:
                    memory_msg.write(zf.read(name))
                    msg = msmsg.MSMSG(memory_msg)
                    if msg.validMSG:
                        self.check_msg_regexs(
                            msg, regexs, search_extensions, enable_pdf,
                            os.path.join(sub_path,
                                         decode_zip_filename(name)))
                finally:
                    memory_msg.close()
            except Exception as exc:
                self.set_error(exc)
def check_regexs(self, regexs, search_extensions):
    """Check this file against the regular expressions and return the matches.

    What is inspected depends on ``self.type``:

    * ``'ZIP'``: if ``self.path`` is a valid ZIP archive it is handed to
      ``check_zip_regexs``, otherwise the error ``'Invalid ZIP file'`` is
      recorded.
    * ``'TEXT'``: the file content is loaded with ``read_file`` and checked.
    * ``'SPECIAL'``: ``.msg`` files are parsed with ``msmsg.MSMSG`` and
      checked; an invalid message records ``'Invalid MSG file'``.

    Any other type is left untouched.  Failures while processing a file are
    reported through ``self.set_error`` rather than raised, so one bad file
    does not abort the caller.

    Args:
        regexs: passed through unchanged to the ``check_*_regexs`` helpers.
        search_extensions: passed through to the ZIP and MSG checkers.

    Returns:
        ``self.matches``.
    """
    if self.type == 'ZIP':
        try:
            if zipfile.is_zipfile(self.path):
                zf = zipfile.ZipFile(self.path)
                try:
                    self.check_zip_regexs(zf, regexs, search_extensions, '')
                finally:
                    # Release the file handle even if the check fails.
                    zf.close()
            else:
                self.set_error('Invalid ZIP file')
        except Exception as exc:
            self.set_error(exc)

    elif self.type == 'TEXT':
        try:
            file_text = read_file(self.path, 'rb')
            self.check_text_regexs(file_text, regexs, '')
        # ``Exception`` covers IOError and (on Windows) WindowsError.  Naming
        # WindowsError explicitly raises NameError on other platforms.
        except Exception as exc:
            self.set_error(exc)

    elif self.type == 'SPECIAL':
        if get_ext(self.path) == '.msg':
            try:
                msg = msmsg.MSMSG(self.path)
                try:
                    if msg.validMSG:
                        self.check_msg_regexs(
                            msg, regexs, search_extensions, '')
                    else:
                        self.set_error('Invalid MSG file')
                finally:
                    # Close the message even when the check raises.
                    msg.close()
            except Exception as exc:
                self.set_error(exc)

    return self.matches