Пример #1
0
    def check_regexs(self, regexs, search_extensions, enable_pdf):
        """Scan the file at ``self.path`` for regex matches, based on ``self.type``.

        * ``'ZIP'``: a ``.docx`` file first has its text extracted with
          ``docx2txt`` and checked; the file is then opened as a ZIP archive
          and handed to ``check_zip_regexs``. A file that is not a valid ZIP
          is recorded as ``'Invalid ZIP file'``.
        * ``'TEXT'``: the file is read in binary mode and its content is
          passed to ``check_text_regexs``.
        * ``'SPECIAL'``: ``.msg`` files go through ``check_msg_regexs``,
          ``.pdf`` files through ``check_pdf_regexs`` (only when
          ``enable_pdf`` is true) and ``.mdb`` files through
          ``check_access_regexs``.

        Any other ``self.type`` value scans nothing.

        Failures while scanning are never propagated: the exception (or an
        error string) is handed to ``self.set_error`` instead.

        Args:
            regexs: passed unchanged to the ``check_*_regexs`` helpers.
            search_extensions: passed unchanged to the archive and message
                scanners.
            enable_pdf: when false, ``.pdf`` files are skipped; also passed
                on to the archive and message scanners.

        Returns:
            ``self.matches``.
        """

        if self.type == 'ZIP':
            try:
                if get_ext(self.path) == '.docx':
                    doctext = docx2txt.process(self.path)
                    self.check_text_regexs(doctext, regexs, '')

                if zipfile.is_zipfile(self.path):
                    zf = zipfile.ZipFile(self.path)
                    try:
                        self.check_zip_regexs(zf, regexs, search_extensions,
                                              enable_pdf, '')
                    finally:
                        # Release the archive handle even if scanning fails.
                        zf.close()
                else:
                    self.set_error('Invalid ZIP file')
            except Exception as exc:
                self.set_error(exc)

        elif self.type == 'TEXT':
            try:
                file_text = read_file(self.path, 'rb')
                self.check_text_regexs(file_text, regexs, '')
            # A single handler on purpose: naming WindowsError here raises
            # NameError on non-Windows platforms as soon as anything fails.
            except Exception as exc:
                self.set_error(exc)

        elif self.type == 'SPECIAL':
            ext = get_ext(self.path)
            if ext == '.msg':
                try:
                    msg = msmsg.MSMSG(self.path)
                    try:
                        if msg.validMSG:
                            self.check_msg_regexs(msg, regexs,
                                                  search_extensions,
                                                  enable_pdf, '')
                        else:
                            self.set_error('Invalid MSG file')
                    finally:
                        # Close the message even when scanning it raised.
                        msg.close()
                except Exception as exc:
                    self.set_error(exc)
            elif ext == '.pdf':
                if enable_pdf:
                    try:
                        pdf = pdfquery.PDFQuery(self.path)
                        pdf.load()
                        self.check_pdf_regexs(pdf, regexs, '')
                    except Exception as exc:
                        self.set_error(exc)
            elif ext == '.mdb':
                try:
                    self.check_access_regexs(self.path, 'mdb', regexs)
                except Exception as exc:
                    self.set_error(exc)

        return self.matches
Пример #2
0
    def check_zip_regexs(self, zf, regexs, search_extensions, enable_pdf,
                         sub_path):
        """Scan the members of an open ZIP archive for regex matches.

        Only members whose extension (per ``get_ext``) appears in one of the
        ``search_extensions`` lists are considered:

        * ``'ZIP'`` extensions: the member is loaded into memory, opened as a
          nested archive and scanned recursively.
        * ``'TEXT'`` extensions: the member's bytes are passed to
          ``check_text_regexs``.
        * anything else (``'SPECIAL'``): only ``.msg`` members are handled;
          a valid message is passed to ``check_msg_regexs``. Other special
          extensions are skipped.

        A failure on one member (for example an encrypted entry) is recorded
        with ``self.set_error`` and the scan continues with the next member.

        Args:
            zf: open archive exposing ``namelist()`` and ``read(name)``, as
                ``zipfile.ZipFile`` does.
            regexs: passed unchanged to the ``check_*_regexs`` helpers.
            search_extensions: mapping with ``'TEXT'``, ``'ZIP'`` and
                ``'SPECIAL'`` keys, each a list of extensions.
            enable_pdf: passed on to nested archive and message scans.
            sub_path: path of this archive inside its parent(s); joined with
                each decoded member name to label the member.

        Raises:
            KeyError: if ``search_extensions`` lacks one of the three keys.
        """

        text_exts = search_extensions['TEXT']
        zip_exts = search_extensions['ZIP']
        all_extensions = text_exts + zip_exts + search_extensions['SPECIAL']

        for file_in_zip in zf.namelist():
            ext = get_ext(file_in_zip)
            if ext not in all_extensions:
                continue

            try:
                if ext in zip_exts:  # nested zip file
                    memory_zip = cStringIO.StringIO()
                    try:
                        # zf.read() closes the member handle for us, unlike
                        # zf.open(...).read().
                        memory_zip.write(zf.read(file_in_zip))
                        nested_zf = zipfile.ZipFile(memory_zip)
                        try:
                            self.check_zip_regexs(
                                nested_zf, regexs, search_extensions,
                                enable_pdf,
                                os.path.join(
                                    sub_path,
                                    decode_zip_filename(file_in_zip)))
                        finally:
                            nested_zf.close()
                    finally:
                        memory_zip.close()
                elif ext in text_exts:  # normal doc
                    file_text = zf.read(file_in_zip)
                    self.check_text_regexs(
                        file_text, regexs,
                        os.path.join(sub_path,
                                     decode_zip_filename(file_in_zip)))
                elif ext == '.msg':  # SPECIAL
                    memory_msg = cStringIO.StringIO()
                    try:
                        memory_msg.write(zf.read(file_in_zip))
                        msg = msmsg.MSMSG(memory_msg)
                        if msg.validMSG:
                            self.check_msg_regexs(
                                msg, regexs, search_extensions, enable_pdf,
                                os.path.join(
                                    sub_path,
                                    decode_zip_filename(file_in_zip)))
                    finally:
                        memory_msg.close()
            except Exception as exc:  # e.g. RuntimeError: zip needs password
                self.set_error(exc)
Пример #3
0
    def check_regexs(self, regexs, search_extensions):
        """Scan the file at ``self.path`` for regex matches, based on ``self.type``.

        * ``'ZIP'``: the file is opened as a ZIP archive and handed to
          ``check_zip_regexs``. A file that is not a valid ZIP is recorded as
          ``'Invalid ZIP file'``.
        * ``'TEXT'``: the file is read in binary mode and its content is
          passed to ``check_text_regexs``.
        * ``'SPECIAL'``: only ``.msg`` files are handled, through
          ``check_msg_regexs``. An invalid message is recorded as
          ``'Invalid MSG file'``.

        Any other ``self.type`` value scans nothing.

        Failures while scanning are never propagated: the exception (or an
        error string) is handed to ``self.set_error`` instead.

        Args:
            regexs: passed unchanged to the ``check_*_regexs`` helpers.
            search_extensions: passed unchanged to the archive and message
                scanners.

        Returns:
            ``self.matches``.
        """

        if self.type == 'ZIP':
            try:
                if zipfile.is_zipfile(self.path):
                    zf = zipfile.ZipFile(self.path)
                    try:
                        self.check_zip_regexs(zf, regexs, search_extensions,
                                              '')
                    finally:
                        # Release the archive handle even if scanning fails.
                        zf.close()
                else:
                    self.set_error('Invalid ZIP file')
            except Exception as exc:
                self.set_error(exc)

        elif self.type == 'TEXT':
            try:
                file_text = read_file(self.path, 'rb')
                self.check_text_regexs(file_text, regexs, '')
            # A single handler on purpose: naming WindowsError here raises
            # NameError on non-Windows platforms as soon as anything fails.
            except Exception as exc:
                self.set_error(exc)

        elif self.type == 'SPECIAL':
            if get_ext(self.path) == '.msg':
                try:
                    msg = msmsg.MSMSG(self.path)
                    try:
                        if msg.validMSG:
                            self.check_msg_regexs(msg, regexs,
                                                  search_extensions, '')
                        else:
                            self.set_error('Invalid MSG file')
                    finally:
                        # Close the message even when scanning it raised.
                        msg.close()
                except Exception as exc:
                    self.set_error(exc)

        return self.matches