Пример #1
0
 def test_is_ppt(self):
     """ check ppt_record_parser.is_ppt(filename) against file extensions

     Walks all files below DATA_BASE_DIR: files with extension .ppt, .pps or
     .pot must be recognized as ppt, all other files must not be.
     """
     # actually is ppt but embedded
     skip_names = ['encrypted.ppt', ]
     ppt_extensions = ('.ppt', '.pps', '.pot')
     for base_dir, _, files in os.walk(DATA_BASE_DIR):
         for filename in files:
             if filename in skip_names:
                 continue
             full_name = join(base_dir, filename)
             extn = splitext(filename)[1]
             detected = ppt_record_parser.is_ppt(full_name)
             if extn in ppt_extensions:
                 self.assertTrue(
                     detected,
                     msg='{0} not recognized as ppt file'.format(full_name))
             else:
                 self.assertFalse(
                     detected,
                     msg='{0} erroneously recognized as ppt'.format(
                         full_name))
Пример #2
0
 def test_is_ppt(self):
     """ compare ppt_record_parser.is_ppt(filename) with the file extension

     Every file found below DATA_BASE_DIR is checked; only files ending in
     .ppt, .pps or .pot are expected to be recognized as ppt.
     """
     excluded = []      # names of files to skip; none at the moment
     for base_dir, _, files in os.walk(DATA_BASE_DIR):
         for filename in (name for name in files if name not in excluded):
             full_name = join(base_dir, filename)
             should_be_ppt = splitext(filename)[1] in ('.ppt', '.pps', '.pot')
             recognized = ppt_record_parser.is_ppt(full_name)
             if should_be_ppt:
                 message = '{0} not recognized as ppt file'.format(full_name)
                 self.assertTrue(recognized, msg=message)
             else:
                 message = '{0} erroneously recognized as ppt'.format(
                     full_name)
                 self.assertFalse(recognized, msg=message)
Пример #3
0
def process_file(filepath, field_filter_mode=None):
    """ decides which of the process_* functions to call

    Checks are done in this order: OLE container (xls, ppt, doc), RTF (by
    comparing the first 4 bytes with RTF_START), then the document type
    reported by ooxml.get_type. If that type cannot be determined, the file
    is processed as csv; any remaining type goes to process_docx.

    :param filepath: path of the file to process
    :param field_filter_mode: forwarded to process_rtf and to process_docx in
                              the default (docx) code path; the other
                              process_* functions are called without it
    :returns: result of the chosen process_* function; an empty unicode
              string for ppt files
    """
    if olefile.isOleFile(filepath):
        logger.debug('Is OLE. Checking streams to see whether this is xls')
        if xls_parser.is_xls(filepath):
            logger.debug('Process file as excel 2003 (xls)')
            return process_xls(filepath)

        ole = olefile.OleFileIO(filepath, path_encoding=None)
        try:
            if is_ppt(ole):
                logger.debug('is ppt - cannot have DDE')
                return u''
            logger.debug('Process file as word 2003 (doc)')
            return process_doc(ole)
        finally:
            # release the underlying file on every exit path (ppt early
            # return, normal return and exceptions alike)
            ole.close()

    with open(filepath, 'rb') as file_handle:
        if file_handle.read(4) == RTF_START:
            logger.debug('Process file as rtf')
            return process_rtf(file_handle, field_filter_mode)

    try:
        doctype = ooxml.get_type(filepath)
        logger.debug('Detected file type: {0}'.format(doctype))
    except Exception as exc:
        # best effort: a file that cannot be xml-parsed is treated as csv below
        logger.debug('Exception trying to xml-parse file: {0}'.format(exc))
        doctype = None

    if doctype == ooxml.DOCTYPE_EXCEL:
        logger.debug('Process file as excel 2007+ (xlsx)')
        return process_xlsx(filepath)
    if doctype in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
        logger.debug('Process file as xml from excel 2003/2007+')
        return process_excel_xml(filepath)
    if doctype in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
        logger.debug('Process file as xml from word 2003/2007+')
        return process_docx(filepath)
    if doctype is None:
        logger.debug('Process file as csv')
        return process_csv(filepath)
    # could be docx; if not: this is the old default code path
    logger.debug('Process file as word 2007+ (docx)')
    return process_docx(filepath, field_filter_mode)
Пример #4
0
def process_file(filepath, field_filter_mode=None):
    """Select the process_* function matching the file type and call it.

    Order of checks: OLE container (xls, ppt, doc), RTF (first 4 bytes equal
    RTF_START), then the type reported by ooxml.get_type. A file whose type
    cannot be determined is processed as csv; any other type is handed to
    process_docx.

    :param filepath: path of the file to process
    :param field_filter_mode: forwarded to process_rtf and to process_docx in
                              the default (docx) code path
    :returns: result of the selected process_* function; an empty unicode
              string for ppt files
    """
    if olefile.isOleFile(filepath):
        logger.debug("Is OLE. Checking streams to see whether this is xls")
        if xls_parser.is_xls(filepath):
            logger.debug("Process file as excel 2003 (xls)")
            result = process_xls(filepath)
        elif is_ppt(filepath):
            logger.debug("is ppt - cannot have DDE")
            result = u""
        else:
            logger.debug("Process file as word 2003 (doc)")
            with olefile.OleFileIO(filepath, path_encoding=None) as ole:
                result = process_doc(ole)
        return result

    # TODO: here we should not assume this is a file on disk, filepath can be a file object
    with open(filepath, "rb") as stream:
        starts_like_rtf = stream.read(4) == RTF_START
        if starts_like_rtf:
            logger.debug("Process file as rtf")
            return process_rtf(stream, field_filter_mode)

    try:
        doctype = ooxml.get_type(filepath)
        logger.debug("Detected file type: {0}".format(doctype))
    except Exception as err:
        logger.debug("Exception trying to xml-parse file: {0}".format(err))
        doctype = None

    if doctype == ooxml.DOCTYPE_EXCEL:
        logger.debug("Process file as excel 2007+ (xlsx)")
        result = process_xlsx(filepath)
    elif doctype in (ooxml.DOCTYPE_EXCEL_XML, ooxml.DOCTYPE_EXCEL_XML2003):
        logger.debug("Process file as xml from excel 2003/2007+")
        result = process_excel_xml(filepath)
    elif doctype in (ooxml.DOCTYPE_WORD_XML, ooxml.DOCTYPE_WORD_XML2003):
        logger.debug("Process file as xml from word 2003/2007+")
        result = process_docx(filepath)
    elif doctype is None:
        logger.debug("Process file as csv")
        result = process_csv(filepath)
    else:
        # could be docx; if not: this is the old default code path
        logger.debug("Process file as word 2007+ (docx)")
        result = process_docx(filepath, field_filter_mode)
    return result
Пример #5
0
def find_ole(filename, data, xml_parser=None):
    """ try to open as ole or zip; yield None if fail

    If data is given, filename is (mostly) ignored: the file is not opened,
    but the name is still concatenated into log messages, so it has to be a
    string.

    :param filename: name of the file to inspect
    :param data: contents to inspect instead of the file, or None
    :param xml_parser: optional XmlParser for the input. If given, the input
                       is handled as zip without checking is_zipfile and the
                       parser's iter_xml() is expected to have been iterated
                       already. If None and the input is a zip file, a parser
                       is created and iterated here.

    yields embedded ole streams in form of OleFileIO. Each yielded object is
    closed by this generator once iteration continues or ends, so it must be
    used before requesting the next item. None is yielded for an encrypted
    zip member, for input that is neither zip nor OLE, and after an
    unexpected exception.
    """

    if data is not None:
        # isOleFile and is_ppt can work on data directly but zip need file
        # --> wrap data in a file-like object without copying data
        log.debug('working on data, file is not touched below')
        arg_for_ole = data
        arg_for_zip = FakeFile(data)
    else:
        # we only have a file name
        log.debug('working on file by name')
        arg_for_ole = filename
        arg_for_zip = filename

    ole = None
    try:
        if olefile.isOleFile(arg_for_ole):
            if is_ppt(arg_for_ole):
                log.info('is ppt file: ' + filename)
                for ole in find_ole_in_ppt(arg_for_ole):
                    yield ole
                    ole = None  # is closed in find_ole_in_ppt
            # in any case: check for embedded stuff in non-sectored streams
            log.info('is ole file: ' + filename)
            ole = olefile.OleFileIO(arg_for_ole)
            yield ole
        elif xml_parser is not None or is_zipfile(arg_for_zip):
            # keep compatibility with 3rd-party code that calls this function
            # directly without providing an XmlParser instance
            if xml_parser is None:
                xml_parser = XmlParser(arg_for_zip)
                # force iteration so XmlParser.iter_non_xml() returns data;
                # drain the iterator without collecting its items in a list
                for _ in xml_parser.iter_xml():
                    pass

            log.info('is zip file: ' + filename)
            # we looped through the XML files before, now we can
            # iterate the non-XML files looking for ole objects
            for subfile, _, file_handle in xml_parser.iter_non_xml():
                try:
                    head = file_handle.read(len(olefile.MAGIC))
                except RuntimeError:
                    log.error('zip is encrypted: ' + filename)
                    yield None
                    continue

                if head == olefile.MAGIC:
                    # rewind so OleFileIO sees the stream from its start
                    file_handle.seek(0)
                    log.info('  unzipping ole: ' + subfile)
                    try:
                        ole = olefile.OleFileIO(file_handle)
                        yield ole
                    except IOError:
                        log.warning('Error reading data from {0}/{1} or '
                                    'interpreting it as OLE object'.format(
                                        filename, subfile))
                        log.debug('', exc_info=True)
                    finally:
                        # close per member; reset so the outer finally does
                        # not close the same object again
                        if ole is not None:
                            ole.close()
                            ole = None
                else:
                    log.debug('unzip skip: ' + subfile)
        else:
            log.warning(
                'open failed: {0} (or its data) is neither zip nor OLE'.format(
                    filename))
            yield None
    except Exception:
        log.error('Caught exception opening {0}'.format(filename),
                  exc_info=True)
        yield None
    finally:
        if ole is not None:
            ole.close()
Пример #6
0
def find_ole(filename, data):
    """ try to open as ole or zip; yield None if fail

    If data is given, filename is (mostly) ignored: the file is not opened,
    but the name is still concatenated into log messages, so it has to be a
    string.

    :param filename: name of the file to inspect
    :param data: contents to inspect instead of the file, or None

    yields embedded ole streams in form of OleFileIO. Each yielded object is
    closed by this generator once iteration continues or ends, so it must be
    used before requesting the next item. None is yielded for an encrypted
    zip member, for input that is neither zip nor OLE, and after an
    unexpected exception.
    """

    if data is not None:
        # isOleFile and is_ppt can work on data directly but zip need file
        # --> wrap data in a file-like object without copying data
        log.debug('working on data, file is not touched below')
        arg_for_ole = data
        arg_for_zip = FakeFile(data)
    else:
        # we only have a file name
        log.debug('working on file by name')
        arg_for_ole = filename
        arg_for_zip = filename

    ole = None
    try:
        if olefile.isOleFile(arg_for_ole):
            if is_ppt(arg_for_ole):
                log.info('is ppt file: ' + filename)
                for ole in find_ole_in_ppt(arg_for_ole):
                    yield ole
                    ole = None  # is closed in find_ole_in_ppt
            # in any case: check for embedded stuff in non-sectored streams
            log.info('is ole file: ' + filename)
            ole = olefile.OleFileIO(arg_for_ole)
            yield ole
        elif is_zipfile(arg_for_zip):
            log.info('is zip file: ' + filename)
            # the with-block closes the archive when the generator finishes,
            # fails or is closed early by its consumer
            with ZipFile(arg_for_zip, 'r') as zipper:
                for subfile in zipper.namelist():
                    head = b''
                    try:
                        with zipper.open(subfile) as file_handle:
                            head = file_handle.read(len(olefile.MAGIC))
                    except RuntimeError:
                        log.error('zip is encrypted: ' + filename)
                        yield None
                        continue

                    if head == olefile.MAGIC:
                        log.info('  unzipping ole: ' + subfile)
                        with ZipSubFile(zipper, subfile) as file_handle:
                            try:
                                ole = olefile.OleFileIO(file_handle)
                                yield ole
                            except IOError:
                                log.warning(
                                    'Error reading data from {0}/{1} or '
                                    'interpreting it as OLE object'.format(
                                        filename, subfile))
                                log.debug('', exc_info=True)
                            finally:
                                # close per member; reset so the outer
                                # finally does not close it again
                                if ole is not None:
                                    ole.close()
                                    ole = None
                    else:
                        log.debug('unzip skip: ' + subfile)
        else:
            log.warning(
                'open failed: {0} (or its data) is neither zip nor OLE'.format(
                    filename))
            yield None
    except Exception:
        log.error('Caught exception opening {0}'.format(filename),
                  exc_info=True)
        yield None
    finally:
        if ole is not None:
            ole.close()
Пример #7
0
def find_ole(filename, data, xml_parser=None):
    """ try to open as ole or zip; yield None if fail

    If data is given, filename is (mostly) ignored: the file is not opened,
    but the name is still concatenated into log messages, so it has to be a
    string.

    :param filename: name of the file to inspect
    :param data: contents to inspect instead of the file, or None
    :param xml_parser: optional XmlParser for the input. If given, the input
                       is handled as zip without checking is_zipfile and the
                       parser's iter_xml() is expected to have been iterated
                       already. If None and the input is a zip file, a parser
                       is created and iterated here.

    yields embedded ole streams in form of OleFileIO. Each yielded object is
    closed by this generator once iteration continues or ends, so it must be
    used before requesting the next item. None is yielded for an encrypted
    zip member, for input that is neither zip nor OLE, and after an
    unexpected exception.
    """

    if data is not None:
        # isOleFile and is_ppt can work on data directly but zip need file
        # --> wrap data in a file-like object without copying data
        log.debug('working on data, file is not touched below')
        arg_for_ole = data
        arg_for_zip = FakeFile(data)
    else:
        # we only have a file name
        log.debug('working on file by name')
        arg_for_ole = filename
        arg_for_zip = filename

    ole = None
    try:
        if olefile.isOleFile(arg_for_ole):
            if is_ppt(arg_for_ole):
                log.info('is ppt file: ' + filename)
                for ole in find_ole_in_ppt(arg_for_ole):
                    yield ole
                    ole = None   # is closed in find_ole_in_ppt
            # in any case: check for embedded stuff in non-sectored streams
            log.info('is ole file: ' + filename)
            ole = olefile.OleFileIO(arg_for_ole)
            yield ole
        elif xml_parser is not None or is_zipfile(arg_for_zip):
            # keep compatibility with 3rd-party code that calls this function
            # directly without providing an XmlParser instance
            if xml_parser is None:
                xml_parser = XmlParser(arg_for_zip)
                # force iteration so XmlParser.iter_non_xml() returns data;
                # consume the iterator without building a throwaway list
                for _ in xml_parser.iter_xml():
                    pass

            log.info('is zip file: ' + filename)
            # we looped through the XML files before, now we can
            # iterate the non-XML files looking for ole objects
            for subfile, _, file_handle in xml_parser.iter_non_xml():
                try:
                    head = file_handle.read(len(olefile.MAGIC))
                except RuntimeError:
                    log.error('zip is encrypted: ' + filename)
                    yield None
                    continue

                if head == olefile.MAGIC:
                    # rewind so OleFileIO sees the stream from its start
                    file_handle.seek(0)
                    log.info('  unzipping ole: ' + subfile)
                    try:
                        ole = olefile.OleFileIO(file_handle)
                        yield ole
                    except IOError:
                        log.warning('Error reading data from {0}/{1} or '
                                    'interpreting it as OLE object'
                                    .format(filename, subfile))
                        log.debug('', exc_info=True)
                    finally:
                        # close per member; reset so the outer finally does
                        # not close the same object again
                        if ole is not None:
                            ole.close()
                            ole = None
                else:
                    log.debug('unzip skip: ' + subfile)
        else:
            log.warning('open failed: {0} (or its data) is neither zip nor OLE'
                        .format(filename))
            yield None
    except Exception:
        log.error('Caught exception opening {0}'.format(filename),
                  exc_info=True)
        yield None
    finally:
        if ole is not None:
            ole.close()